diff --git a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bjlk_SB.yaml index 2b30dbc50..302e95c5e 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bjlk_SB.yaml @@ -16658,8 +16658,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16822,8 +16822,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16982,8 +16982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17146,8 +17146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17306,8 +17306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17470,8 +17470,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17630,8 +17630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17790,8 +17790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17950,8 +17950,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18114,8 +18114,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18274,8 +18274,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18434,8 +18434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18594,8 +18594,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18758,8 +18758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18925,8 +18925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19086,8 +19086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19247,8 +19247,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19412,8 +19412,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19573,8 +19573,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19734,8 +19734,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19895,8 +19895,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20056,8 +20056,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20221,8 +20221,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20386,8 +20386,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20547,8 +20547,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20708,8 +20708,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20869,8 +20869,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21030,8 +21030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21191,8 +21191,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21352,8 +21352,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21513,8 +21513,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21674,8 +21674,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21835,8 +21835,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21996,8 +21996,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22157,8 +22157,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22322,8 +22322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22487,8 +22487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22650,8 +22650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22817,8 +22817,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22982,8 +22982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23145,8 +23145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23312,8 +23312,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23475,8 +23475,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23642,8 +23642,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23807,8 +23807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23970,8 +23970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24137,8 +24137,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24300,8 +24300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24467,8 +24467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24630,8 +24630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24797,8 +24797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24966,8 +24966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25133,8 +25133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25298,8 +25298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25347,11 +25347,11 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25362,8 +25362,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -25371,31 +25371,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25409,10 +25406,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25420,26 +25417,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25449,6 +25454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25458,6 +25464,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25472,35 +25479,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 166 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25511,40 +25526,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25558,10 +25570,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25569,26 +25581,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25598,6 +25618,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25607,6 +25628,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25621,35 +25643,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 167 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25659,41 +25689,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25707,10 +25738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25718,19 +25749,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -25738,6 +25776,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25747,6 +25786,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25756,6 +25796,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25770,79 +25811,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 168 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25855,11 +25905,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25869,17 +25919,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -25887,6 +25944,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25896,6 +25954,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25905,6 +25964,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25919,35 +25979,74623 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 169 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 308 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 366 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 367 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 368 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 369 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 370 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 371 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 372 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 373 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 374 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 375 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 376 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 377 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 378 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 379 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 380 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 381 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 382 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 383 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 384 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 385 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 386 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 387 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 388 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 389 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 395 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 396 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 397 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 398 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 399 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 400 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 401 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 402 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 403 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 404 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 405 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 406 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 407 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 408 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 409 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 410 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 411 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 412 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 423 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 424 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 425 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 426 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 427 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 497 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 498 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 499 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 500 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 501 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 502 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25957,10 +100605,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -25973,25 +100621,26 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26005,10 +100654,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26016,19 +100665,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26036,6 +100692,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26045,6 +100702,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26054,6 +100712,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26068,79 +100727,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 170 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26153,11 +100821,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26165,19 +100833,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26185,6 +100858,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26194,6 +100868,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26203,6 +100878,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26217,96 +100893,275 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 171 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26314,19 +101169,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26334,6 +101196,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26343,6 +101206,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26352,6 +101216,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26366,46 +101231,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 172 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -26422,29 +101295,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -26452,10 +101326,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26471,11 +101345,18 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26483,6 +101364,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26492,6 +101374,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26501,6 +101384,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26515,79 +101399,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 173 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26600,11 +101493,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26612,19 +101505,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26632,6 +101532,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26641,6 +101542,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26650,6 +101552,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26664,35 +101567,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 174 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26702,9 +101613,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26712,27 +101623,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 2048 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -26742,18 +101654,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26769,11 +101681,18 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26781,6 +101700,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26790,6 +101710,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26799,6 +101720,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26813,47 +101735,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 175 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26861,37 +101791,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 4 LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LVCA: 64 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -26900,9 +101831,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26911,18 +101842,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -26930,6 +101868,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26939,6 +101878,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26948,6 +101888,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26962,79 +101903,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 176 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 + LSCB: 64 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27047,11 +101997,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27059,19 +102009,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27079,6 +102036,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27088,6 +102046,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27097,6 +102056,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27111,85 +102071,94 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 177 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -27197,10 +102166,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27208,19 +102177,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27228,6 +102202,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27237,6 +102212,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27246,6 +102222,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27260,75 +102237,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 178 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -27345,11 +102333,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27357,19 +102345,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27377,6 +102372,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27386,6 +102382,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27395,6 +102392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27409,46 +102407,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 179 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -27465,29 +102471,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -27495,9 +102502,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -27506,19 +102513,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27526,6 +102538,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27535,6 +102548,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27544,6 +102558,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27558,96 +102573,107 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 180 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27655,19 +102681,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27675,6 +102708,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27684,6 +102718,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27693,6 +102728,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27707,71 +102743,80 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 181 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -27785,18 +102830,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27804,19 +102849,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27824,6 +102876,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27833,6 +102886,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27842,6 +102896,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27856,48 +102911,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 182 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id004 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -27910,42 +102973,43 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27953,19 +103017,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -27973,6 +103044,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27982,6 +103054,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27991,6 +103064,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28005,35 +103079,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 183 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -28043,8 +103125,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -28061,19 +103143,20 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 4 + LSCB: 32 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -28083,18 +103166,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28104,17 +103187,24 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -28122,6 +103212,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28131,6 +103222,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28140,6 +103232,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28154,35 +103247,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 184 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -28192,41 +103293,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28240,10 +103342,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28251,19 +103353,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -28271,6 +103380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28280,6 +103390,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28289,6 +103400,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28303,79 +103415,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 185 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28388,11 +103509,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28400,19 +103521,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -28420,6 +103548,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28429,6 +103558,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28438,6 +103568,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28452,48 +103583,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 186 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -28501,47 +103640,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28549,19 +103689,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -28569,6 +103714,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28578,6 +103724,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28587,6 +103734,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28601,48 +103749,58 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 187 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -28650,47 +103808,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28698,19 +103857,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -28718,6 +103882,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28727,6 +103892,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28736,6 +103902,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28750,39 +103917,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 188 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28798,31 +103975,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28835,11 +104013,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28847,19 +104025,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -28867,6 +104052,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28876,6 +104062,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28885,6 +104072,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28899,39 +104087,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 189 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28939,8 +104135,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -28948,30 +104144,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28984,11 +104181,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28996,19 +104193,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29016,6 +104220,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29025,6 +104230,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29034,6 +104240,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29048,33 +104255,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 190 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29097,47 +104312,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29145,19 +104361,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29165,6 +104388,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29174,6 +104398,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29183,6 +104408,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29197,39 +104423,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 191 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29237,45 +104471,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -29283,10 +104518,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29294,19 +104529,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29314,6 +104556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29323,6 +104566,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29332,6 +104576,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29346,39 +104591,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 192 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29386,35 +104639,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -29431,10 +104685,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -29445,17 +104699,24 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29463,6 +104724,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29472,6 +104734,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29481,6 +104744,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29495,39 +104759,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 193 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29535,35 +104807,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -29573,7 +104846,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -29581,9 +104854,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -29592,19 +104865,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29612,6 +104892,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29621,6 +104902,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29630,6 +104912,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29644,39 +104927,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 194 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29692,48 +104983,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29741,19 +105033,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29761,6 +105060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29770,6 +105070,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29779,6 +105080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29793,39 +105095,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 195 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29833,7 +105143,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -29841,48 +105151,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29890,19 +105201,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29910,6 +105228,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29919,6 +105238,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29928,6 +105248,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29942,39 +105263,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 196 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29982,7 +105311,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -29990,23 +105319,24 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 1792 LdsNumElementsAlignedA: 512 @@ -30020,7 +105350,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30028,10 +105358,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30039,19 +105369,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30059,6 +105396,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30068,6 +105406,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30077,6 +105416,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30091,39 +105431,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 197 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30131,35 +105479,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -30169,7 +105518,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30177,10 +105526,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30189,18 +105538,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30208,6 +105564,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30217,6 +105574,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30226,6 +105584,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30240,39 +105599,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 198 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30280,7 +105647,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -30288,31 +105655,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -30325,11 +105693,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30337,19 +105705,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30357,6 +105732,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30366,6 +105742,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30375,6 +105752,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30389,39 +105767,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 199 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30429,45 +105815,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30475,10 +105862,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30486,19 +105873,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30506,6 +105900,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30515,6 +105910,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30524,6 +105920,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30538,39 +105935,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 200 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30578,39 +105983,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 4 LSPB: 16 - LVCA: 16 - LVCB: 8 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -30623,10 +106029,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -30637,17 +106043,24 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30655,6 +106068,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30664,6 +106078,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30673,6 +106088,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30687,69 +106103,78 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 201 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -30765,7 +106190,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30773,10 +106198,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30784,19 +106209,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30804,6 +106234,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30813,6 +106244,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30822,6 +106254,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30836,14 +106269,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 202 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -30853,16 +106293,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -30875,9 +106318,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -30885,47 +106328,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30933,19 +106377,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30953,6 +106402,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30962,6 +106412,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30971,6 +106422,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30985,39 +106437,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 203 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31025,7 +106487,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -31033,31 +106495,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -31070,7 +106533,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -31083,18 +106546,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31102,6 +106572,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31111,6 +106582,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31120,6 +106592,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31134,39 +106607,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 204 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31174,7 +106655,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -31182,37 +106663,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31220,10 +106702,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31231,19 +106713,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31251,6 +106740,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31260,6 +106750,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31269,6 +106760,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31283,39 +106775,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 205 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id012 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31331,31 +106831,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 16 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -31368,11 +106869,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31380,19 +106881,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31400,6 +106908,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31409,6 +106918,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31418,6 +106928,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31432,39 +106943,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 206 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31472,8 +106991,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31481,26 +107000,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -31510,7 +107030,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31518,9 +107038,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -31530,18 +107050,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31549,6 +107076,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31558,6 +107086,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31567,6 +107096,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31581,39 +107111,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 207 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31629,48 +107167,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false - LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31678,19 +107217,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31698,6 +107244,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31707,6 +107254,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31716,6 +107264,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31730,96 +107279,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 208 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31827,19 +107385,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31847,6 +107410,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31856,6 +107420,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31865,6 +107430,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31879,39 +107445,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 209 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31920,44 +107496,45 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31965,10 +107542,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31976,19 +107553,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31996,6 +107580,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32005,6 +107590,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32014,6 +107600,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32028,33 +107615,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 210 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32077,30 +107672,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -32114,10 +107710,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32127,17 +107723,24 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32145,6 +107748,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32154,6 +107758,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32163,6 +107768,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32177,33 +107783,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 211 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32226,30 +107840,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 + LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 16 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -32264,9 +107879,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32274,19 +107889,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32294,6 +107916,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32303,6 +107926,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32312,6 +107936,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32326,39 +107951,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 212 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32366,35 +107999,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -32404,7 +108038,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -32412,10 +108046,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32423,19 +108057,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32443,6 +108084,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32452,6 +108094,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32461,53 +108104,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 213 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32515,45 +108167,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -32562,9 +108215,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32572,19 +108225,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32592,6 +108252,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32601,6 +108262,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32610,6 +108272,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32624,14 +108287,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 214 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -32641,22 +108311,23 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32664,45 +108335,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -32711,9 +108383,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32721,19 +108393,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32741,6 +108420,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32750,6 +108430,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32759,6 +108440,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32773,33 +108455,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 215 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32812,7 +108502,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -32822,13 +108512,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 @@ -32872,17 +108563,22 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32890,6 +108586,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32899,6 +108596,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32908,6 +108606,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32922,14 +108621,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 216 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -32940,15 +108646,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id012 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32962,55 +108671,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 8 - LSPB: 32 + LSPB: 16 LVCA: 32 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -33019,19 +108729,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33039,6 +108756,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33048,6 +108766,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33057,6 +108776,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33071,33 +108791,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 217 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -33110,36 +108838,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33149,18 +108878,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33169,18 +108898,23 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33188,6 +108922,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33197,6 +108932,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33206,6 +108942,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33220,96 +108957,107 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 218 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33317,19 +109065,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33337,6 +109090,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33346,6 +109100,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33355,6 +109110,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33369,33 +109125,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 219 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -33409,56 +109175,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33466,19 +109233,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33486,6 +109260,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33495,6 +109270,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33504,6 +109280,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33518,33 +109295,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 220 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -33567,26 +109352,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33605,9 +109391,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33615,19 +109401,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33635,6 +109428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33644,6 +109438,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33653,6 +109448,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33667,33 +109463,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 221 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -33707,29 +109511,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -33766,17 +109571,24 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33784,6 +109596,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33793,6 +109606,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33802,6 +109616,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33816,39 +109631,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 222 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33856,45 +109679,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33903,9 +109727,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33913,19 +109737,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33933,6 +109764,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33942,6 +109774,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33951,6 +109784,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33965,39 +109799,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 223 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34006,7 +109848,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -34014,47 +109856,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 64 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 + LVCB: 32 LVPA: 4 - LVPB: 32 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -34062,19 +109905,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34082,6 +109932,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34091,6 +109942,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34100,6 +109952,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -34114,39 +109967,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 224 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34155,7 +110016,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -34163,36 +110024,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34200,9 +110062,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -34211,19 +110073,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34231,6 +110100,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34240,6 +110110,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34249,6 +110120,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -34263,17 +110135,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 225 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -34281,21 +110160,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34312,24 +110192,25 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -34341,7 +110222,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34349,10 +110230,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -34360,8 +110241,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -34369,10 +110250,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34380,6 +110268,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34389,6 +110278,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34398,6 +110288,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -34412,39 +110303,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 226 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34452,45 +110351,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34498,10 +110398,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -34509,19 +110409,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34529,6 +110436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34538,6 +110446,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34547,6 +110456,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -34561,33 +110471,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 227 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -34601,29 +110519,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -34659,18 +110578,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34678,6 +110604,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34687,6 +110614,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34696,6 +110624,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -34710,14 +110639,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 228 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -34727,31 +110663,32 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -34759,26 +110696,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -34788,7 +110726,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34796,10 +110734,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -34807,19 +110745,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34827,6 +110770,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34836,6 +110780,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34845,6 +110790,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -34859,39 +110805,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 229 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34899,56 +110855,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -34956,8 +110913,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -34965,10 +110922,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34976,6 +110940,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34985,6 +110950,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34994,6 +110960,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35008,46 +110975,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 230 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -35064,15 +111039,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -35086,7 +111062,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -35094,10 +111070,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -35105,19 +111081,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35125,6 +111106,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35134,6 +111116,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35143,6 +111126,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35157,85 +111141,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 231 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -35243,9 +111238,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -35254,19 +111249,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35274,6 +111274,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35283,6 +111284,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35292,6 +111294,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35306,46 +111309,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 232 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -35362,15 +111375,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 @@ -35384,7 +111398,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -35392,10 +111406,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -35403,8 +111417,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -35412,10 +111426,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35423,6 +111444,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35432,6 +111454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35441,6 +111464,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35455,95 +111479,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 233 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -35552,19 +111585,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35572,6 +111612,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35581,6 +111622,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35590,6 +111632,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35604,96 +111647,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 234 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -35701,19 +111753,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35721,6 +111780,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35730,6 +111790,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35739,6 +111800,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35753,85 +111815,94 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 235 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -35840,9 +111911,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -35850,19 +111921,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35870,6 +111948,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35879,6 +111958,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35888,6 +111968,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -35902,73 +111983,82 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 236 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 LSPA: 8 LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -35980,18 +112070,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -35999,19 +112089,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36019,6 +112116,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36028,6 +112126,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36037,6 +112136,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -36051,33 +112151,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 237 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -36091,39 +112199,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -36137,37 +112246,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36177,6 +112296,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36186,47 +112306,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 238 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -36238,41 +112369,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -36286,37 +112414,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36326,6 +112464,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36335,53 +112474,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 239 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36389,39 +112539,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -36434,7 +112585,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -36442,23 +112593,32 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36466,6 +112626,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36475,6 +112636,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36484,53 +112646,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 240 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36538,39 +112711,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -36583,31 +112757,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36615,6 +112798,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36624,6 +112808,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36633,47 +112818,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 241 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -36685,41 +112881,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 16 + LVCA: 64 LVCB: 64 - LVPA: 8 - LVPB: 2 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -36733,37 +112926,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36773,6 +112976,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36782,47 +112986,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 242 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -36834,41 +113049,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -36882,37 +113094,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36922,6 +113144,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36931,53 +113154,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 243 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36985,39 +113219,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 16 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37030,38 +113265,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37071,6 +113316,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37080,93 +113326,101 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 244 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37179,20 +113433,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -37200,17 +113456,25 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37220,6 +113484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37229,53 +113494,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 245 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -37283,39 +113559,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37328,7 +113605,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -37336,30 +113613,40 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37369,6 +113656,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37378,93 +113666,101 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 246 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37477,38 +113773,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37518,6 +113824,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37527,53 +113834,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 247 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -37581,39 +113899,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 16 + LVCA: 64 LVCB: 64 - LVPA: 8 - LVPB: 2 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37626,31 +113945,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37658,6 +113986,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37667,6 +113996,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37676,47 +114006,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 248 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37728,41 +114069,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37776,37 +114114,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37816,6 +114164,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37825,47 +114174,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 249 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37879,39 +114239,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 16 + LSPB: 4 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37925,30 +114286,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37956,6 +114326,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37965,6 +114336,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37974,47 +114346,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 250 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -38026,41 +114409,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38074,37 +114454,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38114,6 +114504,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38123,62 +114514,73 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 251 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -38190,26 +114592,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38222,31 +114625,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38254,6 +114666,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38263,6 +114676,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38272,93 +114686,105 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 252 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id027 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38368,34 +114794,43 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38403,6 +114838,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38412,6 +114848,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38421,47 +114858,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 253 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id027 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -38475,76 +114923,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38552,6 +115010,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38561,6 +115020,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38570,53 +115030,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 254 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38624,65 +115095,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -38690,10 +115164,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38701,6 +115182,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38710,6 +115192,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38719,53 +115202,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 255 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id031 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38773,76 +115267,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38850,6 +115354,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38859,6 +115364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38868,53 +115374,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 256 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38922,39 +115439,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38967,31 +115485,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38999,6 +115526,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39008,6 +115536,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39017,47 +115546,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 257 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39070,9 +115610,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -39080,67 +115620,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39148,6 +115696,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39157,6 +115706,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39166,93 +115716,107 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 258 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -39265,31 +115829,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39297,6 +115868,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39306,6 +115878,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39315,47 +115888,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 259 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39377,68 +115963,78 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39446,6 +116042,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39455,6 +116052,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39464,47 +116062,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 260 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39526,68 +116135,78 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39595,6 +116214,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39604,6 +116224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39613,47 +116234,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 261 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39666,9 +116298,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -39676,67 +116308,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 96 + LSCB: 64 + LSPA: 5 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39744,6 +116384,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39753,6 +116394,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39762,62 +116404,75 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 262 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -39825,67 +116480,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39893,6 +116556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39902,6 +116566,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39911,46 +116576,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 263 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id031 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39962,7 +116641,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -39979,19 +116659,20 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 4 - LSPB: 2 + LSPB: 8 LVCA: 64 - LVCB: 128 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -40010,27 +116691,38 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40038,6 +116730,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40047,6 +116740,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40056,47 +116750,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 264 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -40108,40 +116813,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40156,27 +116863,36 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40184,6 +116900,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40193,6 +116910,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40202,88 +116920,107 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 265 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 1 + LSPB: 5 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40293,39 +117030,49 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40335,6 +117082,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40344,59 +117092,73 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 266 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -40413,19 +117175,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40435,39 +117202,51 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40477,6 +117256,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40486,27 +117266,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 267 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -40517,28 +117308,29 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -40555,19 +117347,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40577,39 +117374,51 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40619,6 +117428,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40628,27 +117438,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 268 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -40659,57 +117480,63 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 LVCB: 64 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40722,36 +117549,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40761,6 +117600,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40770,92 +117610,105 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 269 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40868,36 +117721,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40907,6 +117772,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40916,47 +117782,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 270 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -40968,40 +117845,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41016,27 +117895,38 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41044,6 +117934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41053,6 +117944,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41062,27 +117954,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 271 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -41092,17 +117995,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -41114,40 +118017,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 8 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41161,28 +118066,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41190,6 +118106,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41199,6 +118116,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41208,47 +118126,58 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 272 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -41260,7 +118189,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -41277,6 +118207,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 @@ -41310,28 +118241,40 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41341,6 +118284,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41350,92 +118294,105 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 273 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41448,29 +118405,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41478,6 +118444,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41487,6 +118454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41496,88 +118464,107 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 274 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41590,7 +118577,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -41598,28 +118585,38 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41629,6 +118626,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41638,27 +118636,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 275 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -41668,58 +118677,62 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -41736,36 +118749,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41775,6 +118800,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41784,88 +118810,105 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 276 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41878,36 +118921,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41917,6 +118972,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41926,46 +118982,56 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 277 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id035 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -41979,37 +119045,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42023,24 +119094,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42050,13 +119126,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42076,15 +119154,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -42095,15 +119175,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 278 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -42111,21 +119191,19 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42140,40 +119218,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42187,24 +119266,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42214,13 +119298,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42240,15 +119326,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -42259,37 +119347,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 279 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42303,37 +119389,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 64 LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42347,24 +119438,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42374,13 +119468,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42400,15 +119496,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -42419,15 +119517,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 280 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -42439,9 +119537,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -42449,7 +119547,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42457,47 +119555,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42510,25 +119605,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42538,13 +119638,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42564,15 +119666,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -42583,37 +119687,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 281 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42628,36 +119730,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42671,24 +119774,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42698,6 +119806,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -42705,6 +119814,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42724,15 +119834,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -42743,37 +119855,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 282 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42781,16 +119891,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -42801,27 +119911,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42834,25 +119945,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42862,6 +119978,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42869,6 +119986,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42888,15 +120006,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -42907,8 +120027,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 283 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -42916,28 +120036,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42951,68 +120069,78 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43022,13 +120150,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43048,15 +120178,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -43067,37 +120199,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 284 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43111,68 +120241,78 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43182,13 +120322,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43208,15 +120350,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -43227,14 +120371,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 285 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -43243,21 +120387,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43265,16 +120407,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -43285,54 +120427,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43342,13 +120494,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43368,15 +120522,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -43387,37 +120543,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 286 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43433,8 +120587,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -43445,58 +120599,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43506,6 +120664,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -43513,6 +120672,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43532,15 +120692,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -43551,29 +120713,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 287 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -43581,7 +120743,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43592,71 +120754,81 @@ DepthU: 16 DirectToLds: false DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43666,13 +120838,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43692,15 +120866,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -43711,8 +120887,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 288 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -43721,27 +120897,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43749,74 +120923,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43826,13 +121010,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43852,15 +121038,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -43871,37 +121059,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 289 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43909,74 +121095,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43986,13 +121180,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -44012,15 +121208,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -44031,37 +121229,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 290 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44076,8 +121274,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -44090,57 +121288,63 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 16 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44150,6 +121354,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -44157,6 +121362,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -44176,15 +121382,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -44195,16 +121403,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 291 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -44216,16 +121424,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44233,7 +121439,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44253,33 +121459,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -44287,26 +121494,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44316,8 +121526,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44343,15 +121554,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -44362,35 +121575,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 292 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44398,15 +121611,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -44414,60 +121627,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44477,7 +121698,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -44504,15 +121726,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -44523,35 +121747,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 293 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44559,15 +121783,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -44575,60 +121799,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44638,7 +121870,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -44665,15 +121898,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -44684,35 +121919,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 294 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44720,7 +121955,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44728,7 +121963,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -44736,64 +121971,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44803,8 +122042,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44830,15 +122070,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -44849,35 +122091,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 295 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44885,49 +122127,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -44936,25 +122183,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44964,8 +122212,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44991,15 +122240,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45010,15 +122261,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 296 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -45026,19 +122277,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45066,10 +122319,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 @@ -45103,12 +122357,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -45116,6 +122372,7 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45125,6 +122382,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -45152,15 +122410,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45171,8 +122431,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 297 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45191,7 +122451,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -45199,7 +122459,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45227,21 +122487,22 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 768 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -45258,25 +122519,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45286,8 +122550,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45313,15 +122578,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45332,8 +122599,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 298 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45342,25 +122609,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45368,7 +122635,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -45376,35 +122643,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 16 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45417,27 +122685,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45447,6 +122718,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -45474,15 +122746,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45493,8 +122767,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 299 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45503,25 +122777,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45549,19 +122823,20 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 2 + LSPB: 2 LVCA: 64 LVCB: 64 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -45590,19 +122865,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45612,6 +122890,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: @@ -45639,15 +122918,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45658,35 +122939,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 300 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45714,27 +122995,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 + LSCB: 64 + LSPA: 2 LSPB: 2 LVCA: 64 - LVCB: 128 - LVPA: 4 + LVCB: 64 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45749,25 +123031,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45777,6 +123062,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: @@ -45804,15 +123090,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45823,16 +123111,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 301 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -45843,15 +123131,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45865,7 +123153,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45879,23 +123167,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 2 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45910,25 +123203,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45938,7 +123234,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -45965,15 +123262,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -45984,8 +123283,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 302 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45994,17 +123293,17 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -46012,7 +123311,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46026,37 +123325,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46071,25 +123375,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46099,8 +123406,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -46126,15 +123434,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -46145,8 +123455,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 303 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -46155,25 +123465,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46187,37 +123497,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 16 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46232,25 +123547,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46260,8 +123578,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -46287,15 +123606,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -46306,8 +123627,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 304 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -46316,17 +123637,17 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -46334,7 +123655,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46342,43 +123663,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46391,7 +123717,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -46399,19 +123725,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46421,7 +123748,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -46448,15 +123776,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -46467,35 +123797,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 305 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46503,43 +123835,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 16 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46552,27 +123889,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46582,8 +123920,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -46609,15 +123948,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -46628,8 +123969,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 306 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -46638,25 +123979,27 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46664,9 +124007,9 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr @@ -46684,19 +124027,20 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 1280 LdsOffsetA: 0 @@ -46709,23 +124053,25 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 2 @@ -46734,6 +124080,7 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46743,6 +124090,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -46770,15 +124118,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -46789,8 +124139,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 307 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -46799,25 +124149,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46825,7 +124175,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -46833,35 +124183,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 16 LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46874,27 +124225,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46904,6 +124258,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -46931,15 +124286,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -46950,35 +124307,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 308 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46992,37 +124349,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 + LSCA: 32 + LSCB: 64 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -47036,26 +124398,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47065,8 +124428,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -47092,15 +124456,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -47111,15 +124477,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 309 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -47127,19 +124493,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47147,41 +124515,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 + LSCA: 32 + LSCB: 64 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1536 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -47196,27 +124565,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47226,6 +124596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -47253,15 +124624,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -47272,15 +124645,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 310 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -47288,19 +124661,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47309,9 +124684,9 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -47332,19 +124707,20 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -47354,30 +124730,33 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47387,8 +124766,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -47414,15 +124794,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -47433,35 +124815,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 311 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47475,37 +124857,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -47520,25 +124907,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47548,8 +124938,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -47575,15 +124966,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -47594,35 +124987,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 312 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47654,6 +125047,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 @@ -47691,7 +125085,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -47704,6 +125100,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47713,8 +125110,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -47740,15 +125138,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -47759,8 +125159,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 313 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -47768,7 +125168,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -47781,13 +125181,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47795,14 +125195,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -47819,23 +125219,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -47848,7 +125249,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -47856,19 +125257,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47878,8 +125280,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -47905,15 +125308,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -47924,20 +125329,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 314 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -47945,14 +125350,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47960,7 +125367,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -47968,39 +125375,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48013,25 +125421,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48041,6 +125452,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48068,15 +125480,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -48087,37 +125501,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 315 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48125,14 +125539,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -48149,23 +125563,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 16 - LSPB: 8 + LSPA: 8 + LSPB: 4 LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48178,7 +125593,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -48186,19 +125601,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48208,8 +125624,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -48235,15 +125652,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -48254,20 +125673,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 316 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -48275,14 +125694,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48290,7 +125711,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -48298,39 +125719,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48343,27 +125765,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48373,8 +125798,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -48400,27 +125826,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 317 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48428,14 +125858,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -48443,11 +125873,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48455,47 +125885,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48508,25 +125939,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48536,8 +125972,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -48563,27 +126000,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 318 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48591,28 +126032,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48626,41 +126065,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48674,26 +126110,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48703,7 +126142,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -48730,27 +126170,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 319 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48758,14 +126202,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -48777,7 +126221,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48792,40 +126236,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48839,24 +126284,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48866,6 +126316,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48893,27 +126344,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 320 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48921,28 +126376,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48950,7 +126403,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -48958,39 +126411,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49003,27 +126457,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49033,6 +126490,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49060,27 +126518,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 321 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49088,26 +126550,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49115,7 +126577,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -49123,39 +126585,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49168,7 +126631,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -49176,19 +126639,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49198,8 +126664,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -49225,27 +126692,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 322 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49253,26 +126724,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49280,47 +126751,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49333,7 +126805,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -49341,17 +126813,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49361,8 +126838,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -49388,27 +126866,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 323 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49416,14 +126898,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -49431,13 +126913,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49445,47 +126925,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49498,27 +126979,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49528,8 +127010,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -49555,27 +127038,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 324 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49583,26 +127070,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49610,7 +127099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -49618,39 +127107,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49663,25 +127153,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49691,6 +127184,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49718,27 +127212,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 325 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49746,28 +127244,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49775,47 +127273,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49828,27 +127327,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49858,6 +127358,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49885,27 +127386,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 326 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49913,26 +127418,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49946,41 +127453,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49995,23 +127499,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50021,7 +127530,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -50048,27 +127558,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 327 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -50076,24 +127590,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50105,14 +127617,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -50133,20 +127645,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50159,7 +127671,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -50167,17 +127679,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50215,52 +127732,54 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 328 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50280,40 +127799,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50328,25 +127847,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50358,7 +127880,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -50384,34 +127906,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 329 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -50423,8 +127949,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -50440,12 +127966,12 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -50466,21 +127992,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 2 + LSCB: 32 + LSPA: 1 LSPB: 4 LVCA: 128 - LVCB: 64 - LVPA: 2 + LVCB: 32 + LVPA: 1 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50489,31 +128011,34 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50524,8 +128049,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -50551,34 +128076,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 330 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -50591,7 +128120,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -50613,41 +128142,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 32 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -50661,24 +128190,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50690,7 +128224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -50716,33 +128250,209 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 331 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -50755,9 +128465,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -50897,8 +128607,6 @@ - [83, 6071.16] - - [6784, 6784, 1, 1280] - [80, 9535.64] - - - [1024, 256, 1, 3328] - - [74, 5742.58] - - [1408, 4288, 1, 1280] - [83, 8254.99] - - [3584, 4288, 1, 1280] @@ -52377,8 +130085,6 @@ - [56, 5129.81] - - [2368, 3584, 1, 256] - [74, 8998.7] - - - [1024, 256, 1, 1280] - - [81, 3566.58] - - [5056, 3584, 1, 1280] - [75, 9345.07] - - [448, 4, 1, 3328] @@ -52839,8 +130545,6 @@ - [100, 9061.26] - - [49, 2048, 128, 512] - [98, 6963.26] - - - [784, 512, 64, 128] - - [100, 8822.52] - - [784, 128, 128, 512] - [107, 8983.53] - - [196, 256, 64, 1024] @@ -52851,36 +130555,22 @@ - [99, 8581.25] - - [49, 2048, 256, 512] - [98, 7049.54] - - - [196, 1024, 64, 256] - - [101, 7953.59] - - [784, 128, 256, 512] - [109, 9102.89] - - [196, 256, 128, 1024] - [101, 8085.79] - - - [3136, 64, 64, 256] - - [105, 9266.03] - - - [784, 128, 64, 512] - - [106, 8809.29] - - - [49, 2048, 64, 512] - - [98, 6843.85] - - [3136, 64, 128, 256] - [105, 9381.29] - - [3136, 256, 128, 64] - [103, 8982.54] - - [784, 512, 128, 128] - [100, 8965.89] - - - [3136, 256, 64, 64] - - [103, 8879.7] - - [3136, 64, 256, 256] - [105, 9566.33] - - - [3136, 64, 64, 64] - - [104, 8313.95] - - [3136, 64, 256, 64] - [99, 8743.7] - - [196, 1024, 128, 256] - [102, 8119.33] - - - [49, 512, 64, 2048] - - [110, 7055.31] - - [49, 512, 256, 2048] - [111, 7166.31] - - [196, 1024, 256, 256] @@ -54871,4112 +132561,6364 @@ - [162, 5765.37] - - [4096, 3072, 1, 128] - [164, 8869.01] + - - [768, 3072, 1, 4096] + - [176, 10028.7] + - - [64, 256, 192, 256] + - [170, 8791.55] + - - [768, 2, 1, 16] + - [173, 4.95484] + - - [768, 768, 1, 64] + - [169, 3469.55] + - - [768, 768, 1, 4096] + - [177, 7475.0] + - - [768, 30522, 1, 1280] + - [180, 10296.9] + - - [64, 128, 384, 128] + - [170, 7660.83] + - - [768, 30522, 1, 320] + - [178, 10007.9] + - - [768, 768, 1, 32] + - [167, 2359.3] + - - [3072, 768, 1, 4096] + - [176, 10033.7] + - - [768, 30522, 1, 640] + - [179, 10206.7] + - - [64, 64, 768, 64] + - [168, 5494.72] + - - [768, 768, 1, 640] + - [177, 6721.64] + - - [768, 768, 1, 16] + - [166, 1203.72] + - - [768, 768, 1, 1280] + - [175, 7138.57] + - - [768, 2, 1, 32] + - [171, 11.8154] + - - [2048, 2048, 1, 512] + - [191, 9607.57] + - - [512, 32, 1, 200] + - [184, 422.268] + - - [1024, 1, 1, 200] + - [187, 24.6154] + - - [1600, 1024, 1, 512] + - [182, 8115.91] + - - [560, 1024, 1, 200] + - [181, 4810.74] + - - [1024, 1024, 1, 512] + - [190, 8614.74] + - - [2048, 1, 1, 512] + - [185, 80.9086] + - - [512, 512, 1, 200] + - [183, 4398.39] + - - [100, 2048, 1, 512] + - [188, 4443.12] + - - [1024, 1024, 1, 200] + - [189, 6990.51] + - - [1024, 64, 1, 512] + - [186, 2853.27] + - - [1024, 256, 1, 18944] + - [210, 9196.41] + - - [256, 3328, 1, 8976] + - [200, 8299.26] + - - [1024, 256, 1, 4352] + - [208, 8813.74] + - - [256, 9728, 1, 8976] + - [203, 9638.48] + - - [1024, 256, 1, 3072] + - [210, 8640.63] + - - [768, 2048, 1, 256] + - [202, 8662.93] + - - [1024, 256, 1, 19968] + - [207, 9220.86] + - - [256, 12800, 1, 8976] + - [197, 9418.42] + - - [1024, 256, 1, 3328] + - [211, 8682.48] + - - [256, 10240, 1, 8976] + - [204, 10137.7] + - - [1024, 256, 1, 15104] + - [209, 9167.03] + - - [256, 10496, 1, 8976] + - [197, 9858.38] + - - [1024, 256, 1, 2816] + - [212, 8575.71] + - - [1024, 256, 1, 4608] + - [207, 8861.21] + - - [256, 11264, 1, 8976] + - [194, 9627.69] + - - [1024, 256, 1, 6400] + - [207, 8985.23] + - - [1024, 256, 1, 16128] + - [207, 9170.26] + - - [256, 44505, 1, 8976] + - [201, 10331.8] + - - [256, 6144, 1, 8976] + - [204, 10395.0] + - - [1024, 256, 1, 5120] + - [209, 8881.53] + - - [1024, 256, 1, 7936] + - [212, 9023.14] + - - [256, 3840, 1, 8976] + - [199, 9541.28] + - - [1024, 256, 1, 21248] + - [207, 9209.72] + - - [1024, 256, 1, 12032] + - [209, 9156.17] + - - [256, 8192, 1, 8976] + - [206, 10374.4] + - - [1024, 256, 1, 3584] + - [208, 8712.2] + - - [1024, 256, 1, 14336] + - [209, 9162.51] + - - [256, 7168, 1, 8976] + - [195, 9554.86] + - - [1024, 256, 1, 13568] + - [207, 9165.04] + - - [256, 4096, 1, 8976] + - [199, 10146.6] + - - [1024, 256, 1, 4096] + - [208, 8783.88] + - - [256, 2560, 1, 8976] + - [198, 8381.56] + - - [256, 20992, 1, 8976] + - [197, 9989.86] + - - [256, 4352, 1, 8976] + - [198, 9634.92] + - - [256, 33536, 1, 8976] + - [197, 10218.1] + - - [256, 3584, 1, 8976] + - [199, 8924.5] + - - [256, 26112, 1, 8976] + - [198, 10272.3] + - - [256, 14336, 1, 8976] + - [202, 10217.3] + - - [1024, 256, 1, 14848] + - [209, 9185.19] + - - [1024, 256, 1, 8448] + - [210, 9025.89] + - - [1024, 256, 1, 28672] + - [207, 9256.4] + - - [1024, 256, 1, 5632] + - [207, 8932.69] + - - [256, 22016, 1, 8976] + - [202, 10151.9] + - - [1024, 256, 1, 33536] + - [207, 9243.07] + - - [256, 5120, 1, 8976] + - [193, 9418.05] + - - [256, 11520, 1, 8976] + - [200, 9701.0] + - - [256, 19968, 1, 8976] + - [198, 10228.0] + - - [1024, 256, 1, 5376] + - [209, 8892.52] + - - [1024, 256, 1, 22016] + - [207, 9244.24] + - - [256, 8960, 1, 8976] + - [198, 9841.31] + - - [1024, 256, 1, 15872] + - [207, 9223.15] + - - [256, 17408, 1, 8976] + - [202, 9785.77] + - - [256, 5632, 1, 8976] + - [202, 9564.22] + - - [256, 32512, 1, 8976] + - [201, 10357.9] + - - [256, 11008, 1, 8976] + - [194, 9445.13] + - - [1024, 256, 1, 6144] + - [209, 8955.81] + - - [256, 4864, 1, 8976] + - [194, 8979.35] + - - [256, 15104, 1, 8976] + - [197, 10007.0] + - - [1024, 256, 1, 9984] + - [207, 9110.43] + - - [256, 1280, 1, 8976] + - [193, 5944.34] + - - [1024, 256, 1, 1024] + - [209, 7005.1] + - - [1024, 256, 1, 9728] + - [209, 9066.19] + - - [1024, 256, 1, 10496] + - [207, 9118.05] + - - [256, 11776, 1, 8976] + - [204, 9911.64] + - - [256, 12544, 1, 8976] + - [197, 9235.25] + - - [1024, 256, 1, 17152] + - [207, 9152.21] + - - [1024, 256, 1, 11520] + - [209, 9146.77] + - - [1024, 256, 1, 21504] + - [209, 9207.42] + - - [256, 17152, 1, 8976] + - [196, 9654.71] + - - [1024, 256, 1, 17408] + - [207, 9181.17] + - - [256, 15872, 1, 8976] + - [205, 10086.4] + - - [256, 18688, 1, 8976] + - [198, 9612.47] + - - [256, 5888, 1, 8976] + - [202, 9988.33] + - - [512, 2048, 1, 256] + - [192, 7678.36] + - - [1024, 256, 1, 7680] + - [210, 9032.96] + - - [1024, 256, 1, 1280] + - [212, 7767.23] + - - [256, 14848, 1, 8976] + - [198, 9852.66] + - - [256, 9984, 1, 8976] + - [204, 9908.87] + - - [256, 20480, 1, 8976] + - [202, 10337.1] + - - [1024, 256, 1, 8192] + - [209, 9044.32] + - - [1024, 256, 1, 19712] + - [208, 9184.18] + - - [256, 13568, 1, 8976] + - [198, 9927.82] + - - [256, 13312, 1, 8976] + - [197, 9757.91] + - - [256, 2816, 1, 8976] + - [197, 9191.43] + - - [1024, 256, 1, 2304] + - [208, 8444.91] + - - [256, 21248, 1, 8976] + - [198, 10127.5] + - - [256, 16128, 1, 8976] + - [206, 10238.4] + - - [256, 512, 36, 98] + - [229, 7994.85] + - - [64, 192, 36, 25088] + - [298, 8613.89] + - - [128, 128, 64, 25] + - [228, 2540.15] + - - [256, 256, 64, 56] + - [229, 6924.56] + - - [512, 486, 36, 800] + - [236, 8994.84] + - - [512, 512, 36, 1568] + - [247, 9872.38] + - - [64, 192, 64, 3200] + - [292, 9295.89] + - - [256, 384, 36, 4096] + - [292, 9334.61] + - - [128, 256, 64, 32] + - [231, 4279.9] + - - [64, 128, 64, 23104] + - [298, 10103.1] + - - [128, 256, 64, 9] + - [222, 1709.63] + - - [256, 512, 36, 784] + - [232, 9520.73] + - - [256, 324, 36, 32] + - [270, 4473.38] + - - [512, 512, 36, 33] + - [241, 5925.17] + - - [16, 32, 36, 5760] + - [245, 1448.8] + - - [192, 384, 64, 128] + - [292, 8618.43] + - - [512, 512, 64, 72] + - [248, 8260.12] + - - [128, 128, 64, 1600] + - [221, 9008.38] + - - [512, 512, 36, 128] + - [292, 8871.62] + - - [192, 384, 64, 2304] + - [221, 9657.16] + - - [384, 256, 64, 450] + - [257, 9538.93] + - - [3, 64, 36, 6272] + - [245, 509.784] + - - [3, 64, 64, 2888] + - [274, 708.621] + - - [384, 256, 64, 2304] + - [257, 10287.5] + - - [512, 512, 64, 144] + - [292, 9226.7] + - - [256, 256, 36, 6272] + - [232, 9607.28] + - - [80, 192, 64, 4608] + - [293, 7347.93] + - - [64, 64, 36, 3136] + - [280, 5959.05] + - - [256, 384, 64, 2304] + - [257, 10283.4] + - - [512, 512, 36, 66] + - [241, 7618.08] + - - [128, 256, 64, 800] + - [267, 9611.15] + - - [64, 128, 36, 30] + - [223, 1242.61] + - - [192, 256, 36, 512] + - [292, 8657.97] + - - [256, 512, 64, 200] + - [292, 9153.87] + - - [256, 512, 64, 25] + - [270, 5349.88] + - - [3, 64, 64, 46208] + - [273, 808.562] + - - [128, 256, 36, 1568] + - [265, 8528.62] + - - [64, 128, 64, 11552] + - [298, 9997.0] + - - [128, 192, 64, 946] + - [292, 9198.38] + - - [64, 192, 64, 12800] + - [253, 9000.66] + - - [224, 224, 64, 128] + - [230, 6312.07] + - - [128, 256, 64, 288] + - [292, 8697.87] + - - [64, 64, 64, 826] + - [235, 6650.21] + - - [256, 384, 64, 1152] + - [267, 10106.8] + - - [3, 64, 64, 92416] + - [273, 812.031] + - - [32, 32, 36, 43808] + - [214, 2813.09] + - - [160, 320, 64, 288] + - [224, 8090.86] + - - [1, 16, 36, 23040] + - [261, 42.6667] + - - [128, 256, 36, 128] + - [239, 6049.48] + - - [128, 128, 64, 3360] + - [292, 9199.96] + - - [128, 128, 64, 420] + - [292, 8131.5] + - - [64, 128, 64, 361] + - [229, 6937.98] + - - [512, 512, 36, 16] + - [285, 3797.66] + - - [384, 256, 36, 800] + - [226, 9151.65] + - - [192, 384, 36, 4096] + - [226, 8867.57] + - - [64, 64, 64, 1600] + - [278, 7931.74] + - - [256, 384, 64, 576] + - [258, 9745.8] + - - [512, 512, 64, 14] + - [241, 3638.18] + - - [512, 512, 36, 8] + - [216, 2279.51] + - - [512, 486, 64, 128] + - [232, 8337.83] + - - [1, 16, 64, 640] + - [266, 49.9512] + - - [64, 96, 64, 288] + - [291, 5707.97] + - - [96, 96, 36, 1568] + - [260, 6866.75] + - - [256, 256, 36, 128] + - [264, 7703.82] + - - [64, 128, 36, 53824] + - [252, 6331.31] + - - [256, 256, 36, 32] + - [248, 4648.86] + - - [192, 256, 64, 288] + - [292, 8987.79] + - - [256, 256, 36, 16] + - [262, 2912.71] + - - [128, 256, 36, 3200] + - [265, 8680.27] + - - [160, 320, 64, 512] + - [224, 8449.44] + - - [128, 160, 36, 512] + - [235, 7214.97] + - - [96, 96, 36, 2592] + - [230, 7104.79] + - - [64, 96, 64, 800] + - [260, 7268.32] + - - [147, 64, 36, 18816] + - [276, 7116.26] + - - [160, 320, 36, 512] + - [230, 7874.82] + - - [256, 512, 36, 4] + - [269, 1034.78] + - - [96, 128, 64, 946] + - [252, 7901.07] + - - [256, 324, 64, 1568] + - [257, 8589.53] + - - [128, 128, 64, 50] + - [248, 4070.56] + - - [35, 96, 36, 8960] + - [242, 4207.3] + - - [32, 64, 36, 43808] + - [283, 4390.81] + - - [160, 224, 36, 128] + - [230, 5446.92] + - - [64, 64, 64, 81] + - [255, 2391.18] + - - [256, 256, 36, 3200] + - [221, 9559.55] + - - [256, 256, 36, 210] + - [232, 8414.61] + - - [192, 384, 64, 576] + - [292, 9468.75] + - - [512, 512, 64, 800] + - [267, 10096.4] + - - [512, 24, 36, 800] + - [218, 4761.77] + - - [64, 64, 64, 13216] + - [279, 8491.41] + - - [192, 224, 64, 1152] + - [235, 8769.06] + - - [256, 256, 64, 1152] + - [257, 9988.09] + - - [512, 486, 64, 512] + - [267, 9254.67] + - - [128, 128, 36, 784] + - [230, 7468.06] + - - [256, 512, 64, 1600] + - [254, 10232.5] + - - [512, 512, 64, 9] + - [248, 2599.78] + - - [96, 128, 64, 288] + - [260, 6599.43] + - - [64, 96, 36, 512] + - [260, 5073.75] + - - [256, 512, 36, 1568] + - [292, 9637.81] + - - [128, 128, 64, 400] + - [292, 8192.0] + - - [128, 128, 64, 800] + - [292, 8716.34] + - - [96, 128, 36, 512] + - [280, 6756.93] + - - [16, 32, 36, 360] + - [243, 754.036] + - - [128, 256, 64, 3200] + - [257, 10222.5] + - - [96, 128, 64, 800] + - [260, 7967.9] + - - [256, 512, 64, 4] + - [222, 1097.99] + - - [256, 256, 64, 450] + - [267, 9347.45] + - - [64, 64, 64, 3200] + - [278, 8518.08] + - - [192, 224, 64, 128] + - [238, 7035.17] + - - [128, 128, 64, 288] + - [292, 7751.28] + - - [256, 256, 64, 72] + - [248, 7489.83] + - - [96, 208, 36, 512] + - [260, 6939.11] + - - [128, 256, 36, 3136] + - [235, 8669.33] + - - [64, 64, 36, 3520] + - [230, 6007.47] + - - [64, 128, 36, 1568] + - [293, 6897.7] + - - [160, 320, 64, 242] + - [219, 7873.17] + - - [192, 192, 36, 512] + - [230, 7707.32] + - - [512, 512, 36, 512] + - [292, 9582.42] + - - [1, 16, 64, 10240] + - [244, 71.3511] + - - [128, 128, 36, 512] + - [230, 7149.38] + - - [512, 512, 36, 256] + - [221, 9384.4] + - - [512, 512, 36, 1024] + - [215, 9777.89] + - - [96, 208, 64, 1152] + - [293, 7850.9] + - - [128, 192, 64, 3200] + - [221, 9490.82] + - - [256, 256, 36, 4096] + - [226, 9585.46] + - - [160, 160, 64, 288] + - [260, 7299.8] + - - [256, 256, 64, 896] + - [257, 9850.33] + - - [128, 256, 64, 242] + - [292, 8391.38] + - - [128, 128, 36, 440] + - [235, 6274.72] + - - [96, 128, 36, 1568] + - [280, 7875.03] + - - [192, 384, 36, 1024] + - [226, 8715.72] + - - [64, 96, 36, 10368] + - [297, 7478.59] + - - [128, 256, 64, 100] + - [241, 7084.97] + - - [112, 224, 36, 2048] + - [234, 7555.92] + - - [384, 256, 64, 1152] + - [257, 10102.3] + - - [192, 384, 36, 128] + - [292, 7543.04] + - - [128, 128, 36, 7040] + - [265, 7600.6] + - - [128, 256, 64, 1568] + - [257, 10005.9] + - - [128, 128, 36, 1568] + - [249, 7848.3] + - - [128, 256, 64, 72] + - [272, 6553.6] + - - [256, 256, 36, 12544] + - [286, 9365.04] + - - [256, 256, 36, 105] + - [248, 7286.06] + - - [128, 256, 36, 392] + - [235, 7625.69] + - - [64, 64, 64, 5408] + - [278, 8882.67] + - - [3, 64, 36, 25088] + - [245, 528.942] + - - [384, 256, 36, 1024] + - [292, 9182.75] + - - [35, 96, 36, 13440] + - [299, 4110.29] + - - [128, 256, 64, 1152] + - [257, 9804.87] + - - [256, 324, 64, 32] + - [270, 5043.63] + - - [160, 224, 64, 128] + - [284, 6046.15] + - - [192, 224, 36, 2592] + - [282, 8878.68] + - - [96, 96, 64, 1152] + - [260, 8035.45] + - - [32, 64, 36, 90] + - [217, 964.465] + - - [64, 128, 64, 2888] + - [232, 9047.23] + - - [256, 384, 36, 800] + - [292, 9154.02] + - - [512, 512, 64, 4] + - [289, 1233.62] + - - [192, 320, 36, 128] + - [229, 7388.19] + - - [64, 128, 36, 480] + - [293, 5653.27] + - - [192, 384, 64, 242] + - [292, 9079.99] + - - [256, 486, 64, 32] + - [285, 5909.18] + - - [147, 64, 64, 9702] + - [294, 7319.69] + - - [512, 512, 64, 64] + - [228, 8179.02] + - - [64, 192, 64, 3698] + - [221, 9287.89] + - - [73, 192, 64, 10439] + - [252, 6668.02] + - - [1, 16, 36, 1440] + - [268, 33.4452] + - - [128, 256, 36, 512] + - [235, 7989.15] + - - [512, 512, 64, 576] + - [267, 9951.89] + - - [64, 64, 36, 12544] + - [283, 5872.77] + - - [128, 128, 36, 880] + - [280, 7597.26] + - - [192, 224, 36, 128] + - [238, 6451.2] + - - [64, 64, 64, 800] + - [278, 6916.73] + - - [64, 128, 36, 12544] + - [256, 6395.88] + - - [64, 64, 36, 1568] + - [230, 5536.66] + - - [160, 160, 36, 512] + - [230, 7345.26] + - - [512, 24, 64, 512] + - [220, 5242.88] + - - [3, 64, 36, 3136] + - [245, 475.352] + - - [256, 256, 64, 9] + - [270, 2106.51] + - - [3, 64, 64, 11552] + - [273, 785.127] + - - [128, 256, 36, 12544] + - [288, 8792.13] + - - [128, 128, 36, 3136] + - [249, 8098.46] + - - [256, 512, 36, 3136] + - [232, 9694.39] + - - [64, 64, 36, 196] + - [246, 2757.76] + - - [144, 288, 36, 512] + - [280, 7077.89] + - - [256, 24, 64, 32] + - [259, 1483.83] + - - [384, 384, 36, 800] + - [221, 9246.5] + - - [512, 512, 64, 1600] + - [267, 10277.3] + - - [112, 224, 36, 512] + - [235, 6744.78] + - - [128, 128, 36, 49] + - [241, 2716.29] + - - [512, 512, 36, 4] + - [269, 1156.52] + - - [35, 96, 64, 4235] + - [230, 4631.28] + - - [192, 384, 64, 450] + - [221, 9372.2] + - - [256, 256, 36, 1024] + - [292, 9346.64] + - - [112, 224, 64, 1152] + - [235, 7523.95] + - - [256, 512, 64, 400] + - [254, 9597.95] + - - [149, 32, 36, 19072] + - [299, 5811.8] + - - [128, 256, 36, 6272] + - [235, 8754.68] + - - [128, 192, 36, 1568] + - [260, 8195.1] + - - [256, 256, 36, 512] + - [292, 9074.22] + - - [256, 256, 64, 112] + - [292, 8305.55] + - - [512, 512, 64, 18] + - [285, 4324.02] + - - [256, 256, 64, 18] + - [248, 3547.81] + - - [256, 256, 64, 1568] + - [257, 10141.7] + - - [64, 96, 36, 1568] + - [278, 6805.66] + - - [384, 256, 36, 4096] + - [292, 9311.1] + - - [256, 512, 64, 800] + - [267, 9998.35] + - - [256, 384, 36, 2048] + - [292, 9285.34] + - - [3, 64, 36, 200704] + - [274, 547.375] + - - [384, 384, 64, 2304] + - [215, 9901.68] + - - [160, 320, 64, 128] + - [251, 7113.81] + - - [512, 512, 36, 528] + - [221, 9567.65] + - - [160, 320, 36, 128] + - [252, 6411.13] + - - [96, 96, 64, 800] + - [260, 7690.01] + - - [256, 512, 36, 49] + - [248, 6721.25] + - - [384, 384, 64, 450] + - [221, 9523.53] + - - [3, 64, 64, 23104] + - [273, 801.621] + - - [256, 256, 64, 3200] + - [257, 10300.4] + - - [128, 192, 36, 512] + - [235, 7499.75] + - - [192, 192, 64, 288] + - [292, 8774.24] + - - [96, 208, 64, 242] + - [252, 5901.99] + - - [256, 16, 36, 3200] + - [281, 3807.77] + - - [512, 512, 64, 8] + - [259, 2379.75] + - - [64, 128, 64, 5776] + - [232, 9332.74] + - - [512, 512, 64, 288] + - [221, 9521.99] + - - [256, 16, 36, 32] + - [277, 766.005] + - - [128, 192, 64, 288] + - [292, 8527.58] + - - [32, 64, 64, 640] + - [260, 4660.34] + - - [64, 64, 36, 392] + - [260, 3686.4] + - - [384, 384, 36, 1024] + - [226, 9282.48] + - - [64, 64, 36, 11552] + - [290, 5904.78] + - - [96, 128, 36, 6272] + - [280, 8350.99] + - - [128, 256, 36, 16] + - [262, 2144.81] + - - [256, 256, 64, 288] + - [292, 9140.13] + - - [64, 64, 64, 1652] + - [278, 7766.53] + - - [256, 384, 36, 1024] + - [226, 9203.27] + - - [96, 128, 64, 3200] + - [295, 8866.2] + - - [256, 324, 36, 3200] + - [234, 8194.25] + - - [128, 192, 64, 800] + - [292, 9198.03] + - - [64, 128, 64, 10] + - [233, 851.117] + - - [96, 208, 64, 288] + - [260, 6667.58] + - - [64, 96, 36, 2592] + - [242, 7216.88] + - - [64, 128, 64, 160] + - [271, 5190.97] + - - [192, 384, 64, 512] + - [221, 9446.04] + - - [64, 64, 36, 6272] + - [230, 6212.01] + - - [512, 24, 36, 288] + - [227, 3922.47] + - - [128, 128, 64, 1568] + - [221, 9037.86] + - - [112, 224, 64, 242] + - [291, 6399.26] + - - [128, 256, 64, 1600] + - [257, 10010.3] + - - [32, 32, 64, 20000] + - [225, 4378.41] + - - [160, 192, 64, 288] + - [252, 7803.63] + - - [512, 24, 64, 128] + - [213, 3733.8] + - - [512, 512, 36, 32] + - [248, 5935.34] + - - [3, 64, 36, 100352] + - [245, 542.783] + - - [3, 64, 64, 1444] + - [274, 674.159] + - - [512, 512, 36, 3136] + - [215, 9921.1] + - - [128, 256, 64, 6400] + - [275, 10349.3] + - - [256, 256, 36, 2048] + - [292, 9518.99] + - - [128, 160, 64, 288] + - [235, 7549.75] + - - [256, 256, 64, 6400] + - [257, 10392.6] + - - [32, 64, 64, 20000] + - [283, 6493.86] + - - [256, 256, 36, 1680] + - [232, 9513.29] + - - [128, 128, 64, 210] + - [292, 7094.1] + - - [192, 384, 36, 2048] + - [221, 8818.65] + - - [256, 256, 64, 144] + - [292, 8608.61] + - - [384, 384, 36, 4096] + - [226, 9356.94] + - - [160, 320, 64, 1152] + - [252, 8749.48] + - - [384, 256, 36, 2048] + - [292, 9279.63] + - - [256, 512, 36, 392] + - [292, 9252.14] + - - [256, 512, 64, 50] + - [248, 7511.29] + - - [73, 192, 36, 23360] + - [296, 5802.93] + - - [3, 64, 36, 50176] + - [245, 542.037] + - - [384, 384, 36, 2048] + - [221, 9325.8] + - - [256, 384, 64, 450] + - [267, 9528.66] + - - [192, 320, 64, 128] + - [226, 8399.81] + - - [128, 256, 36, 32] + - [241, 3276.8] + - - [160, 192, 36, 512] + - [280, 7752.34] + - - [512, 512, 64, 256] + - [232, 9473.64] + - - [256, 512, 64, 32] + - [270, 6391.32] + - - [384, 384, 64, 576] + - [221, 9614.79] + - - [64, 64, 64, 648] + - [278, 6282.15] + - - [512, 486, 36, 288] + - [292, 8624.93] + - - [32, 64, 36, 1440] + - [230, 3961.5] + - - [144, 288, 64, 242] + - [252, 6347.02] + - - [384, 256, 64, 576] + - [257, 9775.24] + - - [512, 512, 36, 64] + - [228, 7791.28] + - - [448, 384, 64, 128] + - [221, 9132.23] + - - [64, 128, 64, 722] + - [271, 8047.11] + - - [144, 288, 64, 288] + - [280, 6859.4] + - - [512, 512, 64, 224] + - [292, 9427.29] + - - [112, 224, 64, 288] + - [291, 6736.92] + - - [384, 384, 64, 1152] + - [215, 9820.46] + - - [448, 384, 36, 128] + - [292, 8761.31] + - - [64, 64, 64, 100] + - [238, 2708.1] + - - [256, 486, 36, 128] + - [264, 7640.04] + - - [64, 96, 64, 4608] + - [293, 8351.49] + - - [16, 32, 64, 160] + - [217, 736.36] + - - [64, 192, 36, 6272] + - [293, 8041.19] + - - [64, 64, 64, 200] + - [246, 3924.31] + - - [256, 256, 36, 800] + - [292, 9299.55] + - - [64, 128, 36, 6272] + - [290, 6816.36] + - - [32, 64, 64, 40] + - [237, 885.622] + - - [256, 16, 64, 32] + - [287, 1205.26] + - - [192, 384, 36, 800] + - [226, 8673.88] + - - [128, 128, 36, 3200] + - [260, 8538.89] + - - [256, 256, 36, 256] + - [232, 8454.36] + - - [192, 384, 64, 1152] + - [221, 9589.01] + - - [128, 256, 64, 200] + - [231, 8141.12] + - - [64, 96, 64, 1152] + - [260, 7620.88] + - - [128, 128, 36, 392] + - [235, 6175.51] + - - [80, 192, 36, 10368] + - [283, 6497.16] + - - [224, 224, 36, 128] + - [293, 5826.89] + - - [512, 512, 64, 28] + - [248, 5728.81] + - - [256, 16, 64, 1568] + - [263, 4637.2] + - - [144, 288, 64, 1152] + - [280, 7784.24] + - - [256, 256, 64, 576] + - [257, 9596.12] + - - [64, 128, 36, 784] + - [293, 6058.99] + - - [256, 24, 36, 128] + - [227, 2239.84] + - - [256, 256, 64, 2304] + - [257, 10225.7] + - - [192, 384, 36, 512] + - [292, 8549.03] + - - [16, 32, 64, 2560] + - [245, 2153.13] + - - [256, 512, 36, 32] + - [270, 5702.23] + - - [512, 512, 64, 128] + - [292, 9084.11] + - - [128, 128, 64, 200] + - [229, 6971.91] + - - [512, 512, 64, 32] + - [241, 6248.5] + - - [128, 256, 36, 196] + - [241, 6628.76] + - - [8, 384, 64, 6600] + - [273, 2733.89] + - - [149, 32, 64, 8195] + - [235, 6050.91] + - - [35, 96, 64, 6160] + - [280, 4689.35] + - - [64, 64, 36, 1760] + - [230, 5622.24] + - - [196, 528, 32, 32] + - [313, 4088.41] + - - [5329, 64, 32, 80] + - [306, 8331.14] + - - [64, 2880, 1, 320] + - [357, 4362.6] + - - [49, 832, 32, 256] + - [320, 5618.63] + - - [3136, 64, 64, 64] + - [306, 8457.65] + - - [196, 512, 32, 24] + - [307, 3621.73] + - - [289, 1120, 1, 160] + - [303, 3302.86] + - - [1225, 192, 32, 32] + - [311, 6194.57] + - - [64, 2048, 32, 384] + - [334, 9541.54] + - - [1001, 1536, 1, 32] + - [305, 3575.67] + - - [289, 1792, 1, 320] + - [328, 5140.33] + - - [3136, 256, 64, 64] + - [329, 9310.12] + - - [1001, 1024, 1, 32] + - [300, 2733.4] + - - [196, 480, 32, 64] + - [361, 5070.42] + - - [64, 1728, 1, 320] + - [358, 3205.57] + - - [49, 832, 32, 160] + - [362, 4988.82] + - - [49, 2048, 64, 512] + - [332, 7370.31] + - - [49, 832, 32, 384] + - [320, 5901.95] + - - [289, 896, 1, 192] + - [346, 3452.59] + - - [289, 1024, 32, 384] + - [365, 8902.42] + - - [784, 192, 32, 96] + - [376, 7853.63] + - - [50176, 256, 1, 128] + - [339, 9041.83] + - - [289, 1024, 32, 256] + - [374, 8660.72] + - - [289, 1024, 32, 192] + - [363, 8433.35] + - - [12544, 512, 1, 256] + - [323, 9187.34] + - - [1225, 1728, 1, 192] + - [327, 7720.85] + - - [196, 480, 32, 96] + - [372, 5662.5] + - - [196, 512, 32, 144] + - [366, 6531.38] + - - [784, 400, 1, 32] + - [301, 1280.0] + - - [289, 768, 32, 128] + - [367, 7913.61] + - - [5329, 576, 1, 96] + - [310, 7563.46] + - - [49, 1200, 1, 128] + - [354, 1011.61] + - - [64, 1536, 32, 256] + - [368, 9159.54] + - - [289, 2592, 1, 384] + - [336, 6002.71] + - - [196, 528, 32, 128] + - [371, 5987.1] + - - [64, 2048, 32, 448] + - [334, 9669.87] + - - [196, 1024, 64, 256] + - [373, 7818.94] + - - [5329, 448, 1, 64] + - [306, 6201.02] + - - [784, 256, 32, 64] + - [308, 7623.18] + - - [784, 192, 32, 32] + - [313, 5874.26] + - - [21609, 288, 1, 32] + - [326, 5296.5] + - - [784, 256, 32, 32] + - [304, 6235.46] + - - [5041, 720, 1, 192] + - [322, 8140.98] + - - [289, 2016, 1, 256] + - [319, 5404.05] + - - [196, 512, 32, 128] + - [364, 6366.82] + - - [289, 768, 32, 160] + - [366, 8253.88] + - - [64, 1536, 32, 384] + - [337, 9508.5] + - - [64, 1280, 32, 320] + - [337, 9070.73] + - - [289, 896, 1, 128] + - [347, 2917.68] + - - [289, 3456, 1, 384] + - [327, 7274.91] + - - [196, 800, 1, 64] + - [349, 1393.78] + - - [64, 1280, 32, 384] + - [333, 9225.01] + - - [64, 1344, 1, 512] + - [352, 3041.45] + - - [1001, 4096, 1, 512] + - [333, 9391.77] + - - [1225, 192, 32, 64] + - [306, 7729.29] + - - [64, 1152, 1, 384] + - [356, 2440.65] + - - [729, 1600, 1, 192] + - [318, 6827.71] + - - [289, 1344, 1, 192] + - [316, 4439.04] + - - [784, 192, 32, 16] + - [343, 3663.04] + - - [3136, 1024, 1, 2048] + - [325, 9071.77] + - - [64, 1152, 1, 448] + - [353, 2564.45] + - - [49, 832, 32, 128] + - [316, 4733.16] + - - [784, 256, 32, 128] + - [329, 8471.6] + - - [49, 800, 1, 128] + - [351, 633.535] + - - [196, 512, 32, 32] + - [313, 4354.26] + - - [1225, 384, 32, 96] + - [330, 8751.63] + - - [5041, 576, 1, 96] + - [312, 7067.63] + - - [49, 832, 32, 48] + - [345, 3316.72] + - - [3136, 64, 64, 256] + - [367, 9721.9] + - - [5329, 160, 32, 64] + - [369, 8159.84] + - - [1225, 288, 32, 48] + - [359, 6673.65] + - - [4096, 9216, 1, 512] + - [341, 10116.9] + - - [196, 480, 32, 192] + - [370, 6388.46] + - - [64, 1152, 1, 256] + - [357, 1982.6] + - - [3136, 1024, 1, 512] + - [325, 8745.57] + - - [49, 832, 32, 32] + - [344, 2717.87] + - - [784, 192, 32, 64] + - [308, 7216.32] + - - [289, 1024, 32, 128] + - [331, 7970.5] + - - [289, 768, 32, 192] + - [375, 8327.27] + - - [289, 1120, 1, 192] + - [315, 3716.9] + - - [196, 512, 32, 112] + - [321, 6252.81] + - - [1001, 2048, 1, 32] + - [309, 4000.09] + - - [1225, 288, 32, 64] + - [369, 7208.04] + - - [196, 600, 1, 64] + - [348, 1093.95] + - - [1225, 384, 32, 192] + - [330, 9332.66] + - - [50176, 256, 1, 512] + - [340, 9833.54] + - - [196, 512, 32, 160] + - [367, 6614.34] + - - [4096, 4096, 1, 512] + - [338, 10032.2] + - - [49, 832, 32, 192] + - [316, 5244.53] + - - [1225, 256, 32, 64] + - [306, 7972.35] + - - [64, 2048, 32, 320] + - [334, 9404.27] + - - [196, 480, 32, 16] + - [360, 2724.49] + - - [1225, 256, 32, 48] + - [308, 7100.38] + - - [64, 1280, 32, 448] + - [333, 9344.41] + - - [1225, 1200, 1, 64] + - [302, 5157.89] + - - [1225, 384, 32, 64] + - [306, 8219.96] + - - [12544, 512, 1, 1024] + - [325, 9672.72] + - - [64, 1280, 32, 192] + - [321, 8525.01] + - - [196, 512, 32, 64] + - [306, 5489.34] + - - [289, 1792, 1, 256] + - [324, 4831.61] + - - [196, 528, 32, 256] + - [342, 6453.82] + - - [49, 512, 64, 2048] + - [377, 7548.98] + - - [64, 2048, 32, 192] + - [329, 8955.81] + - - [784, 512, 64, 128] + - [329, 9160.73] + - - [784, 128, 64, 512] + - [336, 9280.69] + - - [196, 528, 32, 160] + - [370, 6161.15] + - - [1225, 192, 32, 48] + - [306, 7236.92] + - - [64, 1728, 1, 192] + - [356, 2480.57] + - - [1001, 2048, 1, 64] + - [382, 5714.42] + - - [5329, 64, 128, 80] + - [389, 8835.29] + - - [64, 1280, 128, 448] + - [387, 10020.5] + - - [289, 768, 128, 128] + - [390, 8542.71] + - - [1225, 192, 128, 64] + - [379, 8444.77] + - - [1225, 288, 128, 48] + - [392, 7244.66] + - - [289, 768, 128, 192] + - [394, 8794.49] + - - [289, 768, 128, 160] + - [391, 8705.33] + - - [64, 2048, 128, 192] + - [385, 9780.26] + - - [64, 1280, 128, 384] + - [388, 9950.9] + - - [1225, 256, 128, 48] + - [380, 8273.61] + - - [1225, 192, 128, 48] + - [380, 8140.32] + - - [1225, 288, 128, 64] + - [392, 7886.21] + - - [64, 1280, 128, 320] + - [384, 9894.56] + - - [1225, 256, 128, 64] + - [385, 8572.51] + - - [1001, 2048, 1, 128] + - [386, 7289.06] + - - [1225, 192, 128, 32] + - [381, 7104.57] + - - [64, 1280, 128, 192] + - [393, 9642.08] + - - [1001, 1536, 1, 64] + - [383, 5146.56] - - [1024, 128, 1, 128] - - [170, 1028.12] + - [399, 1028.12] - - [4, 704, 1, 1280] - - [209, 363.455] + - [438, 363.455] - - [4, 1856, 1, 3328] - - [209, 579.534] + - [438, 579.534] - - [1856, 448, 1, 3328] - - [246, 6966.83] + - [475, 6966.83] - - [2944, 4288, 1, 1280] - - [241, 9057.98] + - [470, 9057.98] - - [2368, 64, 1, 3328] - - [202, 5837.66] + - [431, 5837.66] - - [2368, 5888, 1, 256] - - [246, 9111.16] + - [475, 9111.16] - - [128, 64, 1, 256] - - [208, 374.591] + - [437, 374.591] - - [5888, 1024, 1, 1280] - - [251, 8570.54] + - [480, 8570.54] - - [128, 6784, 1, 3328] - - [214, 7703.96] + - [443, 7703.96] - - [64, 4, 1, 256] - - [260, 11.3219] + - [489, 11.3219] - - [5888, 1856, 1, 3328] - - [246, 9394.4] + - [475, 9394.4] - - [5056, 704, 1, 256] - - [249, 8026.99] + - [478, 8026.99] - - [5888, 2944, 1, 3328] - - [239, 7608.21] + - [468, 7608.21] - - [1856, 4288, 1, 256] - - [240, 8986.42] + - [469, 8986.42] - - [1024, 5056, 1, 128] - - [232, 3898.34] + - [461, 3898.34] - - [5056, 5056, 1, 3328] - - [240, 9536.85] + - [469, 9536.85] - - [1408, 5888, 1, 1280] - - [241, 9279.19] + - [470, 9279.19] - - [2368, 448, 1, 128] - - [233, 2474.42] + - [462, 2474.42] - - [1024, 3584, 1, 3328] - - [243, 9258.58] + - [472, 9258.58] - - [4, 2944, 1, 1280] - - [195, 611.84] + - [424, 611.84] - - [1408, 64, 1, 128] - - [166, 858.31] + - [395, 858.31] - - [256, 4288, 1, 3328] - - [246, 7616.08] + - [475, 7616.08] - - [5888, 1408, 1, 1280] - - [239, 9620.39] + - [468, 9620.39] - - [704, 1856, 1, 3328] - - [240, 9033.75] + - [469, 9033.75] - - [4, 1408, 1, 128] - - [253, 24.455] + - [482, 24.455] - - [1024, 2368, 1, 256] - - [240, 7526.25] + - [469, 7526.25] - - [1408, 1856, 1, 1280] - - [243, 8324.19] + - [472, 8324.19] - - [1408, 64, 1, 1280] - - [214, 4681.24] + - [443, 4681.24] - - [448, 1024, 1, 1280] - - [240, 7112.53] + - [469, 7112.53] - - [256, 1408, 1, 3328] - - [246, 5825.51] + - [475, 5825.51] - - [5056, 5056, 1, 1280] - - [249, 9233.65] + - [478, 9233.65] - - [448, 5056, 1, 256] - - [241, 7003.27] + - [470, 7003.27] - - [704, 1856, 1, 1280] - - [240, 8877.38] + - [469, 8877.38] - - [128, 5056, 1, 128] - - [232, 2301.14] + - [461, 2301.14] - - [2368, 128, 1, 256] - - [240, 3849.04] + - [469, 3849.04] - - [1856, 1408, 1, 128] - - [235, 4202.31] + - [464, 4202.31] - - [64, 5056, 1, 256] - - [241, 3109.62] + - [470, 3109.62] - - [6784, 256, 1, 3328] - - [240, 6388.53] + - [469, 6388.53] - - [6784, 4288, 1, 3328] - - [251, 9114.67] + - [480, 9114.67] - - [4288, 448, 1, 256] - - [244, 5783.05] + - [473, 5783.05] - - [64, 704, 1, 128] - - [177, 379.519] + - [406, 379.519] - - [1856, 2368, 1, 3328] - - [240, 9128.46] + - [469, 9128.46] - - [4288, 2944, 1, 1280] - - [246, 9182.33] + - [475, 9182.33] - - [704, 5056, 1, 1280] - - [240, 9071.57] + - [469, 9071.57] - - [2368, 704, 1, 3328] - - [246, 7731.43] + - [475, 7731.43] - - [256, 5888, 1, 256] - - [240, 7920.38] + - [469, 7920.38] - - [1856, 4288, 1, 3328] - - [246, 9330.07] + - [475, 9330.07] - - [256, 2944, 1, 256] - - [247, 5312.27] + - [476, 5312.27] - - [5888, 1024, 1, 256] - - [238, 6710.97] + - [467, 6710.97] - - [448, 64, 1, 1280] - - [213, 2814.53] + - [442, 2814.53] - - [448, 5056, 1, 3328] - - [240, 8255.53] + - [469, 8255.53] - - [3584, 4, 1, 1280] - - [189, 640.815] + - [418, 640.815] - - [2944, 64, 1, 256] - - [188, 2621.54] + - [417, 2621.54] - - [128, 4, 1, 1280] - - [260, 86.3316] + - [489, 86.3316] - - [1408, 2944, 1, 256] - - [240, 8848.99] + - [469, 8848.99] - - [256, 1856, 1, 1280] - - [240, 7366.55] + - [469, 7366.55] - - [6784, 5056, 1, 3328] - - [251, 8332.16] + - [480, 8332.16] - - [5056, 5056, 1, 256] - - [246, 9171.74] + - [475, 9171.74] - - [1408, 6784, 1, 128] - - [232, 5079.19] + - [461, 5079.19] - - [64, 1024, 1, 1280] - - [204, 3679.31] + - [433, 3679.31] - - [2944, 4, 1, 256] - - [195, 369.543] + - [424, 369.543] - - [704, 5056, 1, 128] - - [232, 4509.27] + - [461, 4509.27] - - [4, 2368, 1, 1280] - - [189, 569.844] + - [418, 569.844] - - [2368, 2944, 1, 1280] - - [251, 7451.14] + - [480, 7451.14] - - [128, 3584, 1, 1280] - - [249, 6071.26] + - [478, 6071.26] - - [6784, 6784, 1, 1280] - - [246, 9535.74] - - - [1024, 256, 1, 3328] - - [240, 5742.68] + - [475, 9535.74] - - [1408, 4288, 1, 1280] - - [249, 8255.09] + - [478, 8255.09] - - [3584, 4288, 1, 1280] - - [251, 9651.19] + - [480, 9651.19] - - [2368, 704, 1, 1280] - - [246, 8291.4] + - [475, 8291.4] - - [5056, 4288, 1, 3328] - - [238, 9406.36] + - [467, 9406.36] - - [3584, 2368, 1, 3328] - - [246, 9350.32] + - [475, 9350.32] - - [64, 704, 1, 1280] - - [213, 3384.59] + - [442, 3384.59] - - [4288, 256, 1, 256] - - [246, 5593.62] + - [475, 5593.62] - - [2944, 128, 1, 128] - - [168, 2130.6] + - [397, 2130.6] - - [6784, 448, 1, 1280] - - [249, 8815.85] + - [478, 8815.85] - - [1408, 2944, 1, 128] - - [232, 4558.34] + - [461, 4558.34] - - [4288, 2944, 1, 256] - - [251, 7865.43] + - [480, 7865.43] - - [5888, 704, 1, 1280] - - [240, 9262.99] + - [469, 9262.99] - - [1856, 64, 1, 1280] - - [214, 4359.15] + - [443, 4359.15] - - [448, 5888, 1, 128] - - [235, 4000.59] + - [464, 4000.59] - - [5888, 64, 1, 3328] - - [215, 6603.39] + - [444, 6603.39] - - [2944, 256, 1, 3328] - - [240, 8423.63] + - [469, 8423.63] - - [1024, 64, 1, 128] - - [185, 582.642] + - [414, 582.642] - - [5056, 2368, 1, 1280] - - [240, 9419.91] + - [469, 9419.91] - - [448, 3584, 1, 1280] - - [240, 7985.82] + - [469, 7985.82] - - [6784, 5888, 1, 256] - - [238, 9494.36] + - [467, 9494.36] - - [704, 1024, 1, 128] - - [232, 2813.35] + - [461, 2813.35] - - [704, 128, 1, 1280] - - [214, 4477.71] + - [443, 4477.71] - - [5888, 2944, 1, 128] - - [235, 4745.96] + - [464, 4745.96] - - [4, 3584, 1, 128] - - [252, 96.479] + - [481, 96.479] - - [1408, 448, 1, 1280] - - [240, 6912.8] + - [469, 6912.8] - - [1024, 1408, 1, 256] - - [248, 5810.85] + - [477, 5810.85] - - [2368, 2368, 1, 3328] - - [249, 9088.71] + - [478, 9088.71] - - [1856, 6784, 1, 128] - - [235, 5168.32] + - [464, 5168.32] - - [5056, 704, 1, 3328] - - [241, 7464.9] + - [470, 7464.9] - - [1408, 1856, 1, 256] - - [246, 6727.69] + - [475, 6727.69] - - [1408, 704, 1, 3328] - - [246, 8379.53] + - [475, 8379.53] - - [2368, 5056, 1, 256] - - [246, 8664.11] + - [475, 8664.11] - - [5888, 1856, 1, 256] - - [251, 5810.02] + - [480, 5810.02] - - [4288, 64, 1, 3328] - - [228, 6583.94] + - [457, 6583.94] - - [2368, 4, 1, 1280] - - [261, 545.251] + - [490, 545.251] - - [704, 5888, 1, 256] - - [246, 8813.71] + - [475, 8813.71] - - [4288, 64, 1, 256] - - [204, 3059.97] + - [433, 3059.97] - - [6784, 64, 1, 256] - - [246, 3490.96] + - [475, 3490.96] - - [2944, 256, 1, 256] - - [240, 6970.4] + - [469, 6970.4] - - [2944, 6784, 1, 3328] - - [240, 9475.79] + - [469, 9475.79] - - [704, 1408, 1, 3328] - - [240, 8154.18] + - [469, 8154.18] - - [3584, 704, 1, 3328] - - [240, 8995.07] + - [469, 8995.07] - - [2944, 256, 1, 128] - - [232, 2824.13] + - [461, 2824.13] - - [6784, 4, 1, 1280] - - [189, 625.714] + - [418, 625.714] - - [1024, 64, 1, 1280] - - [201, 3307.91] + - [430, 3307.91] - - [448, 4288, 1, 256] - - [246, 6074.48] + - [475, 6074.48] - - [64, 3584, 1, 3328] - - [194, 6200.26] + - [423, 6200.26] - - [704, 2368, 1, 1280] - - [240, 8291.4] + - [469, 8291.4] - - [448, 2944, 1, 128] - - [232, 3221.87] + - [461, 3221.87] - - [1856, 2368, 1, 1280] - - [251, 6855.24] + - [480, 6855.24] - - [2368, 128, 1, 3328] - - [202, 6479.61] + - [431, 6479.61] - - [2944, 128, 1, 256] - - [240, 3828.23] + - [469, 3828.23] - - [448, 1408, 1, 256] - - [241, 4525.9] + - [470, 4525.9] - - [1856, 4288, 1, 1280] - - [239, 9160.32] + - [468, 9160.32] - - [64, 5056, 1, 3328] - - [222, 6819.3] + - [451, 6819.3] - - [4, 704, 1, 256] - - [206, 123.541] + - [435, 123.541] - - [1024, 448, 1, 128] - - [235, 1989.27] + - [464, 1989.27] - - [704, 4, 1, 1280] - - [209, 381.931] + - [438, 381.931] - - [704, 256, 1, 128] - - [232, 1109.17] + - [461, 1109.17] - - [704, 2944, 1, 128] - - [232, 4089.03] + - [461, 4089.03] - - [1408, 1024, 1, 1280] - - [246, 8192.08] + - [475, 8192.08] - - [704, 6784, 1, 256] - - [240, 6717.9] + - [469, 6717.9] - - [6784, 704, 1, 256] - - [246, 5429.22] + - [475, 5429.22] - - [5056, 1408, 1, 128] - - [232, 4954.5] + - [461, 4954.5] - - [256, 3584, 1, 3328] - - [240, 7890.96] + - [469, 7890.96] - - [4, 5888, 1, 3328] - - [257, 691.047] + - [486, 691.047] - - [128, 1408, 1, 128] - - [179, 1393.14] + - [408, 1393.14] - - [3584, 4288, 1, 3328] - - [242, 8900.87] + - [471, 8900.87] - - [5888, 1856, 1, 1280] - - [243, 9345.85] + - [472, 9345.85] - - [5056, 1024, 1, 3328] - - [244, 7834.84] + - [473, 7834.84] - - [5056, 64, 1, 1280] - - [222, 5890.14] + - [451, 5890.14] - - [1024, 704, 1, 256] - - [240, 6007.57] + - [469, 6007.57] - - [1024, 4288, 1, 128] - - [234, 3497.09] + - [463, 3497.09] - - [4288, 64, 1, 1280] - - [219, 4726.59] + - [448, 4726.59] - - [2368, 3584, 1, 1280] - - [238, 8128.82] + - [467, 8128.82] - - [2368, 6784, 1, 1280] - - [238, 9478.72] + - [467, 9478.72] - - [1024, 256, 1, 256] - - [246, 4092.1] + - [475, 4092.1] - - [1856, 4, 1, 1280] - - [261, 509.903] + - [490, 509.903] - - [448, 448, 1, 256] - - [246, 3001.28] + - [475, 3001.28] - - [2944, 3584, 1, 3328] - - [247, 9081.91] + - [476, 9081.91] - - [128, 4288, 1, 128] - - [167, 2323.33] + - [396, 2323.33] - - [64, 448, 1, 256] - - [210, 1066.97] + - [439, 1066.97] - - [128, 1024, 1, 3328] - - [223, 6392.36] + - [452, 6392.36] - - [4, 1408, 1, 3328] - - [206, 616.656] + - [435, 616.656] - - [6784, 2944, 1, 256] - - [249, 8547.73] + - [478, 8547.73] - - [64, 1856, 1, 1280] - - [222, 4409.71] + - [451, 4409.71] - - [64, 1024, 1, 128] - - [166, 554.902] + - [395, 554.902] - - [4288, 2368, 1, 3328] - - [242, 8780.08] + - [471, 8780.08] - - [1856, 2368, 1, 256] - - [249, 4976.74] + - [478, 4976.74] - - [3584, 256, 1, 128] - - [234, 2812.37] + - [463, 2812.37] - - [3584, 6784, 1, 3328] - - [244, 9278.22] + - [473, 9278.22] - - [256, 1024, 1, 256] - - [240, 4346.53] + - [469, 4346.53] - - [4, 6784, 1, 3328] - - [259, 681.366] + - [488, 681.366] - - [1024, 5888, 1, 3328] - - [240, 9187.61] + - [469, 9187.61] - - [1024, 128, 1, 1280] - - [192, 3660.05] + - [421, 3660.05] - - [4288, 128, 1, 1280] - - [246, 6019.17] + - [475, 6019.17] - - [5056, 4288, 1, 1280] - - [238, 9343.96] + - [467, 9343.96] - - [5888, 64, 1, 256] - - [240, 4692.17] + - [469, 4692.17] - - [1856, 256, 1, 1280] - - [246, 4790.38] + - [475, 4790.38] - - [64, 5888, 1, 3328] - - [214, 6702.2] + - [443, 6702.2] - - [2944, 5888, 1, 128] - - [235, 5202.65] + - [464, 5202.65] - - [704, 5888, 1, 1280] - - [240, 9264.29] + - [469, 9264.29] - - [2368, 3584, 1, 128] - - [232, 5053.71] + - [461, 5053.71] - - [6784, 5888, 1, 3328] - - [238, 7926.8] + - [467, 7926.8] - - [704, 1024, 1, 1280] - - [239, 5402.6] + - [468, 5402.6] - - [448, 256, 1, 3328] - - [222, 6124.65] + - [451, 6124.65] - - [448, 1856, 1, 128] - - [233, 2885.96] + - [462, 2885.96] - - [128, 1024, 1, 128] - - [167, 1013.22] + - [396, 1013.22] - - [2944, 4, 1, 128] - - [252, 77.6374] + - [481, 77.6374] - - [1024, 704, 1, 1280] - - [240, 7365.58] + - [469, 7365.58] - - [128, 5888, 1, 256] - - [240, 6990.61] + - [469, 6990.61] - - [1024, 5056, 1, 1280] - - [245, 9422.0] + - [474, 9422.0] - - [4288, 1024, 1, 256] - - [247, 6270.03] + - [476, 6270.03] - - [2944, 2368, 1, 128] - - [232, 4918.18] + - [461, 4918.18] - - [704, 704, 1, 3328] - - [240, 7963.65] + - [469, 7963.65] - - [704, 1408, 1, 1280] - - [240, 8347.32] + - [469, 8347.32] - - [5888, 448, 1, 1280] - - [246, 5217.05] + - [475, 5217.05] - - [3584, 256, 1, 3328] - - [240, 7802.25] + - [469, 7802.25] - - [704, 5888, 1, 3328] - - [246, 8381.46] + - [475, 8381.46] - - [704, 1856, 1, 128] - - [232, 3598.38] + - [461, 3598.38] - - [128, 3584, 1, 3328] - - [202, 7161.11] + - [431, 7161.11] - - [6784, 2368, 1, 1280] - - [251, 9464.41] + - [480, 9464.41] - - [4, 4288, 1, 128] - - [252, 132.68] + - [481, 132.68] - - [128, 704, 1, 1280] - - [214, 4463.85] + - [443, 4463.85] - - [3584, 2944, 1, 256] - - [251, 8201.24] + - [480, 8201.24] - - [1856, 128, 1, 3328] - - [193, 6575.5] + - [422, 6575.5] - - [4, 64, 1, 1280] - - [209, 43.6745] + - [438, 43.6745] - - [4, 5056, 1, 3328] - - [189, 675.315] + - [418, 675.315] - - [128, 2944, 1, 1280] - - [193, 5916.99] + - [422, 5916.99] - - [2368, 1024, 1, 3328] - - [246, 8646.84] + - [475, 8646.84] - - [128, 256, 1, 3328] - - [227, 4130.85] + - [456, 4130.85] - - [1408, 5056, 1, 3328] - - [245, 9529.75] + - [474, 9529.75] - - [1856, 1856, 1, 3328] - - [244, 8114.99] + - [473, 8114.99] - - [3584, 128, 1, 256] - - [240, 5603.18] + - [469, 5603.18] - - [448, 1408, 1, 3328] - - [240, 7073.03] + - [469, 7073.03] - - [2368, 2368, 1, 256] - - [247, 7648.76] + - [476, 7648.76] - - [4288, 4288, 1, 1280] - - [242, 9244.11] + - [471, 9244.11] - - [64, 448, 1, 1280] - - [213, 2885.33] + - [442, 2885.33] - - [1408, 4288, 1, 256] - - [240, 8080.41] + - [469, 8080.41] - - [448, 4, 1, 256] - - [258, 84.4294] + - [487, 84.4294] - - [5888, 448, 1, 128] - - [235, 3540.8] + - [464, 3540.8] - - [448, 4, 1, 1280] - - [209, 322.257] + - [438, 322.257] - - [704, 6784, 1, 3328] - - [239, 8613.58] + - [468, 8613.58] - - [5888, 5888, 1, 1280] - - [246, 9502.05] + - [475, 9502.05] - - [5056, 1024, 1, 1280] - - [249, 9110.11] + - [478, 9110.11] - - [448, 5888, 1, 3328] - - [240, 8586.43] + - [469, 8586.43] - - [128, 4, 1, 128] - - [252, 4.27959] + - [481, 4.27959] - - [1024, 2944, 1, 1280] - - [248, 7096.53] + - [477, 7096.53] - - [5056, 5888, 1, 1280] - - [239, 9693.51] + - [468, 9693.51] - - [4288, 5888, 1, 128] - - [232, 5406.46] + - [461, 5406.46] - - [256, 3584, 1, 256] - - [240, 6908.37] + - [469, 6908.37] - - [1408, 3584, 1, 128] - - [232, 4645.69] + - [461, 4645.69] - - [256, 2944, 1, 3328] - - [243, 6284.4] + - [472, 6284.4] - - [448, 3584, 1, 128] - - [235, 3675.37] + - [464, 3675.37] - - [5888, 2944, 1, 1280] - - [245, 9628.9] + - [474, 9628.9] - - [4, 6784, 1, 1280] - - [189, 688.176] + - [418, 688.176] - - [2368, 5888, 1, 128] - - [232, 5273.96] + - [461, 5273.96] - - [64, 2944, 1, 128] - - [176, 1316.54] + - [405, 1316.54] - - [3584, 5888, 1, 256] - - [246, 9239.14] + - [475, 9239.14] - - [2368, 704, 1, 128] - - [235, 3537.65] + - [464, 3537.65] - - [3584, 2944, 1, 1280] - - [240, 9324.62] + - [469, 9324.62] - - [3584, 2368, 1, 128] - - [232, 4766.34] + - [461, 4766.34] - - [5056, 704, 1, 128] - - [232, 4487.95] + - [461, 4487.95] - - [448, 2368, 1, 128] - - [235, 2877.02] + - [464, 2877.02] - - [5056, 1408, 1, 3328] - - [251, 9515.97] + - [480, 9515.97] - - [1408, 704, 1, 256] - - [243, 6836.18] + - [472, 6836.18] - - [6784, 1024, 1, 3328] - - [238, 9309.65] + - [467, 9309.65] - - [6784, 2944, 1, 3328] - - [239, 9536.58] + - [468, 9536.58] - - [2944, 5056, 1, 3328] - - [240, 9526.25] + - [469, 9526.25] - - [1856, 1856, 1, 256] - - [240, 5239.24] + - [469, 5239.24] - - [1024, 5888, 1, 128] - - [232, 4006.28] + - [461, 4006.28] - - [2048, 7133, 1, 2048] - - [238, 9828.07] + - [467, 9828.07] - - [256, 4, 1, 128] - - [253, 4.38908] + - [482, 4.38908] - - [4288, 5888, 1, 1280] - - [248, 9202.83] + - [477, 9202.83] - - [4288, 4288, 1, 256] - - [243, 5521.18] + - [472, 5521.18] - - [448, 2944, 1, 3328] - - [246, 7724.53] + - [475, 7724.53] - - [4288, 1856, 1, 1280] - - [246, 8826.34] + - [475, 8826.34] - - [1856, 2944, 1, 3328] - - [240, 9194.9] + - [469, 9194.9] - - [256, 6784, 1, 3328] - - [240, 8740.33] + - [469, 8740.33] - - [64, 5888, 1, 256] - - [240, 4766.35] + - [469, 4766.35] - - [256, 5056, 1, 128] - - [232, 2937.6] + - [461, 2937.6] - - [5056, 1024, 1, 256] - - [251, 5467.91] + - [480, 5467.91] - - [704, 64, 1, 3328] - - [228, 4818.43] + - [457, 4818.43] - - [5056, 1856, 1, 3328] - - [245, 8861.69] + - [474, 8861.69] - - [4, 2944, 1, 3328] - - [195, 662.102] + - [424, 662.102] - - [4, 5056, 1, 256] - - [255, 494.121] + - [484, 494.121] - - [1856, 1408, 1, 256] - - [240, 8674.78] + - [469, 8674.78] - - [3584, 4, 1, 128] - - [252, 108.296] + - [481, 108.296] - - [448, 448, 1, 3328] - - [214, 6457.4] + - [443, 6457.4] - - [6784, 128, 1, 3328] - - [207, 7256.71] + - [436, 7256.71] - - [4288, 1408, 1, 128] - - [235, 4791.76] + - [464, 4791.76] - - [4288, 5056, 1, 256] - - [240, 8560.84] + - [469, 8560.84] - - [1408, 128, 1, 1280] - - [222, 5085.79] + - [451, 5085.79] - - [5056, 256, 1, 3328] - - [243, 7284.23] + - [472, 7284.23] - - [704, 704, 1, 256] - - [240, 6171.19] + - [469, 6171.19] - - [1024, 5888, 1, 1280] - - [245, 8852.89] + - [474, 8852.89] - - [6784, 2368, 1, 128] - - [233, 4729.3] + - [462, 4729.3] - - [4, 5056, 1, 1280] - - [206, 670.046] + - [435, 670.046] - - [64, 128, 1, 256] - - [208, 369.317] + - [437, 369.317] - - [128, 1856, 1, 1280] - - [202, 5549.13] + - [431, 5549.13] - - [5056, 3584, 1, 256] - - [246, 7115.84] + - [475, 7115.84] - - [1856, 1024, 1, 1280] - - [238, 8196.5] + - [467, 8196.5] - - [6784, 4288, 1, 1280] - - [239, 9509.66] + - [468, 9509.66] - - [1856, 1856, 1, 1280] - - [241, 5791.99] + - [470, 5791.99] - - [6784, 2944, 1, 128] - - [232, 5317.12] + - [461, 5317.12] - - [1408, 5056, 1, 1280] - - [241, 8980.73] + - [470, 8980.73] - - [4, 2368, 1, 3328] - - [206, 592.634] + - [435, 592.634] - - [5888, 1856, 1, 128] - - [231, 4600.2] + - [460, 4600.2] - - [448, 704, 1, 1280] - - [240, 2286.58] + - [469, 2286.58] - - [2368, 1024, 1, 128] - - [235, 3911.12] + - [464, 3911.12] - - [1024, 448, 1, 3328] - - [240, 7295.24] + - [469, 7295.24] - - [1856, 704, 1, 1280] - - [240, 8881.12] + - [469, 8881.12] - - [5056, 3584, 1, 128] - - [232, 4911.68] + - [461, 4911.68] - - [5888, 5888, 1, 3328] - - [248, 9243.9] + - [477, 9243.9] - - [6784, 1024, 1, 256] - - [251, 5475.41] + - [480, 5475.41] - - [2944, 2368, 1, 256] - - [246, 5670.77] + - [475, 5670.77] - - [256, 448, 1, 256] - - [197, 2293.86] + - [426, 2293.86] - - [5056, 5888, 1, 3328] - - [241, 7848.07] + - [470, 7848.07] - - [1856, 1024, 1, 256] - - [246, 7517.7] + - [475, 7517.7] - - [448, 1408, 1, 1280] - - [240, 6917.54] + - [469, 6917.54] - - [3584, 448, 1, 1280] - - [246, 7980.86] + - [475, 7980.86] - - [1024, 1024, 1, 1280] - - [243, 8384.52] + - [472, 8384.52] - - [448, 5888, 1, 256] - - [240, 7365.75] + - [469, 7365.75] - - [704, 64, 1, 128] - - [185, 358.755] + - [414, 358.755] - - [1408, 6784, 1, 3328] - - [246, 9094.19] + - [475, 9094.19] - - [448, 1024, 1, 128] - - [235, 1773.05] + - [464, 1773.05] - - [4288, 704, 1, 128] - - [232, 4355.38] + - [461, 4355.38] - - [128, 1856, 1, 128] - - [171, 1610.73] + - [400, 1610.73] - - [448, 2368, 1, 3328] - - [246, 7366.47] + - [475, 7366.47] - - [5056, 64, 1, 128] - - [171, 2157.33] + - [400, 2157.33] - - [5056, 2944, 1, 256] - - [240, 9123.16] + - [469, 9123.16] - - [6784, 5888, 1, 128] - - [231, 5285.9] + - [460, 5285.9] - - [704, 1024, 1, 256] - - [246, 6667.35] + - [475, 6667.35] - - [1024, 4, 1, 256] - - [195, 187.346] + - [424, 187.346] - - [2368, 1856, 1, 256] - - [246, 6777.94] + - [475, 6777.94] - - [128, 6784, 1, 1280] - - [243, 7052.71] + - [472, 7052.71] - - [1408, 3584, 1, 3328] - - [247, 9038.05] + - [476, 9038.05] - - [2368, 6784, 1, 256] - - [240, 9181.45] + - [469, 9181.45] - - [5056, 1408, 1, 1280] - - [245, 9422.0] + - [474, 9422.0] - - [256, 256, 1, 128] - - [177, 543.404] + - [406, 543.404] - - [5056, 4288, 1, 128] - - [235, 5340.02] + - [464, 5340.02] - - [1408, 1856, 1, 128] - - [232, 4270.99] + - [461, 4270.99] - - [1408, 5888, 1, 3328] - - [244, 9034.89] + - [473, 9034.89] - - [1856, 256, 1, 256] - - [246, 5847.93] + - [475, 5847.93] - - [6784, 6784, 1, 256] - - [239, 9624.48] + - [468, 9624.48] - - [64, 256, 1, 128] - - [178, 146.549] + - [407, 146.549] - - [4288, 2368, 1, 128] - - [231, 3897.04] + - [460, 3897.04] - - [1856, 4288, 1, 128] - - [232, 4337.17] + - [461, 4337.17] - - [256, 4288, 1, 1280] - - [240, 7499.52] + - [469, 7499.52] - - [2368, 2944, 1, 256] - - [245, 7703.28] + - [474, 7703.28] - - [4, 1856, 1, 256] - - [258, 264.064] + - [487, 264.064] - - [3584, 1856, 1, 1280] - - [240, 9224.43] + - [469, 9224.43] - - [6784, 6784, 1, 128] - - [232, 5476.13] + - [461, 5476.13] - - [256, 1856, 1, 128] - - [235, 1858.82] + - [464, 1858.82] - - [704, 64, 1, 1280] - - [213, 3368.77] + - [442, 3368.77] - - [5888, 5056, 1, 256] - - [246, 5859.91] + - [475, 5859.91] - - [3584, 448, 1, 256] - - [246, 7298.43] + - [475, 7298.43] - - [448, 4288, 1, 128] - - [232, 3813.55] + - [461, 3813.55] - - [2944, 4288, 1, 3328] - - [241, 9149.73] + - [470, 9149.73] - - [256, 6784, 1, 256] - - [240, 7984.95] + - [469, 7984.95] - - [1408, 4288, 1, 128] - - [235, 4728.44] + - [464, 4728.44] - - [2944, 704, 1, 3328] - - [246, 7149.86] + - [475, 7149.86] - - [128, 448, 1, 256] - - [212, 1699.18] + - [441, 1699.18] - - [512, 32, 1, 512] - - [212, 1127.6] + - [441, 1127.6] - - [3584, 3584, 1, 256] - - [241, 8558.11] + - [470, 8558.11] - - [448, 1408, 1, 128] - - [232, 2504.45] + - [461, 2504.45] - - [128, 256, 1, 1280] - - [213, 3216.59] + - [442, 3216.59] - - [3584, 5056, 1, 256] - - [238, 5674.45] + - [467, 5674.45] - - [6784, 128, 1, 256] - - [240, 6216.49] + - [469, 6216.49] - - [4288, 4, 1, 256] - - [256, 435.706] + - [485, 435.706] - - [64, 1408, 1, 3328] - - [214, 6186.01] + - [443, 6186.01] - - [704, 448, 1, 256] - - [246, 4005.08] + - [475, 4005.08] - - [2944, 2368, 1, 1280] - - [247, 8542.8] + - [476, 8542.8] - - [448, 64, 1, 3328] - - [227, 3835.33] + - [456, 3835.33] - - [1408, 3584, 1, 256] - - [240, 8714.63] + - [469, 8714.63] - - [3584, 4, 1, 3328] - - [195, 689.554] + - [424, 689.554] - - [6784, 3584, 1, 256] - - [245, 9271.34] + - [474, 9271.34] - - [256, 128, 1, 128] - - [178, 283.499] + - [407, 283.499] - - [704, 1408, 1, 128] - - [232, 3210.57] + - [461, 3210.57] - - [4, 2368, 1, 256] - - [258, 360.938] + - [487, 360.938] - - [2944, 448, 1, 128] - - [232, 3344.41] + - [461, 3344.41] - - [128, 1408, 1, 256] - - [240, 3186.38] + - [469, 3186.38] - - [4, 2944, 1, 256] - - [256, 384.622] + - [485, 384.622] - - [64, 128, 1, 3328] - - [209, 2103.72] + - [438, 2103.72] - - [5056, 2368, 1, 128] - - [232, 5219.76] + - [461, 5219.76] - - [2944, 2944, 1, 3328] - - [249, 9174.69] + - [478, 9174.69] - - [5056, 6784, 1, 256] - - [251, 8992.36] + - [480, 8992.36] - - [1856, 3584, 1, 128] - - [232, 4957.27] + - [461, 4957.27] - - [128, 2944, 1, 128] - - [170, 2241.48] + - [399, 2241.48] - - [1024, 704, 1, 3328] - - [250, 6545.11] + - [479, 6545.11] - - [6784, 448, 1, 256] - - [246, 5379.25] + - [475, 5379.25] - - [3584, 6784, 1, 128] - - [232, 5102.01] + - [461, 5102.01] - - [128, 4288, 1, 256] - - [240, 5211.86] + - [469, 5211.86] - - [704, 448, 1, 3328] - - [241, 4504.15] + - [470, 4504.15] - - [1024, 1024, 1, 3328] - - [243, 8009.77] + - [472, 8009.77] - - [128, 128, 1, 3328] - - [226, 3185.03] + - [455, 3185.03] - - [5056, 1856, 1, 256] - - [240, 9138.43] + - [469, 9138.43] - - [256, 128, 1, 256] - - [212, 1205.36] + - [441, 1205.36] - - [1024, 1856, 1, 256] - - [251, 6375.09] + - [480, 6375.09] - - [4288, 64, 1, 128] - - [168, 1695.43] + - [397, 1695.43] - - [256, 448, 1, 3328] - - [215, 5659.67] + - [444, 5659.67] - - [1408, 6784, 1, 1280] - - [240, 9349.2] + - [469, 9349.2] - - [3584, 3584, 1, 1280] - - [245, 9302.19] + - [474, 9302.19] - - [64, 2368, 1, 1280] - - [214, 4433.07] + - [443, 4433.07] - - [448, 2368, 1, 1280] - - [240, 7250.77] + - [469, 7250.77] - - [5888, 5888, 1, 128] - - [232, 4616.03] + - [461, 4616.03] - - [64, 6784, 1, 3328] - - [246, 6987.23] + - [475, 6987.23] - - [2944, 256, 1, 1280] - - [249, 6127.45] + - [478, 6127.45] - - [5056, 5888, 1, 128] - - [231, 5106.39] + - [460, 5106.39] - - [256, 2368, 1, 128] - - [232, 2141.23] + - [461, 2141.23] - - [5056, 2368, 1, 3328] - - [243, 9041.75] + - [472, 9041.75] - - [2944, 4288, 1, 256] - - [251, 8691.22] + - [480, 8691.22] - - [1408, 3584, 1, 1280] - - [240, 9070.0] + - [469, 9070.0] - - [2368, 64, 1, 256] - - [212, 2412.87] + - [441, 2412.87] - - [64, 448, 1, 3328] - - [227, 3739.14] + - [456, 3739.14] - - [256, 256, 1, 3328] - - [214, 5304.18] + - [443, 5304.18] - - [5888, 4, 1, 128] - - [253, 105.655] + - [482, 105.655] - - [1856, 704, 1, 256] - - [240, 8025.43] + - [469, 8025.43] - - [4, 4288, 1, 1280] - - [187, 579.07] + - [416, 579.07] - - [1408, 448, 1, 3328] - - [248, 5714.51] + - [477, 5714.51] - - [1024, 4, 1, 3328] - - [206, 608.649] + - [435, 608.649] - - [2368, 256, 1, 256] - - [246, 5173.08] + - [475, 5173.08] - - [2368, 6784, 1, 3328] - - [246, 9456.61] + - [475, 9456.61] - - [1856, 1408, 1, 1280] - - [251, 7805.19] + - [480, 7805.19] - - [1856, 448, 1, 1280] - - [238, 6185.04] + - [467, 6185.04] - - [6784, 704, 1, 128] - - [232, 4597.87] + - [461, 4597.87] - - [4, 4, 1, 256] - - [209, 0.791892] + - [438, 0.791892] - - [128, 5888, 1, 128] - - [170, 2691.76] + - [399, 2691.76] - - [1408, 5888, 1, 256] - - [245, 7164.27] + - [474, 7164.27] - - [704, 2944, 1, 1280] - - [247, 8139.81] + - [476, 8139.81] - - [1856, 2368, 1, 128] - - [235, 4623.38] + - [464, 4623.38] - - [4096, 7133, 1, 4096] - - [239, 9940.07] + - [468, 9940.07] - - [256, 64, 1, 256] - - [203, 689.953] + - [432, 689.953] - - [1024, 1024, 1, 256] - - [246, 7216.11] + - [475, 7216.11] - - [704, 1856, 1, 256] - - [246, 6364.17] + - [475, 6364.17] - - [128, 4288, 1, 3328] - - [202, 7200.59] + - [431, 7200.59] - - [3584, 704, 1, 1280] - - [249, 7972.08] + - [478, 7972.08] - - [256, 128, 1, 1280] - - [200, 2702.62] + - [429, 2702.62] - - [2368, 4, 1, 256] - - [195, 326.018] + - [424, 326.018] - - [256, 2368, 1, 1280] - - [240, 6638.93] + - [469, 6638.93] - - [2944, 6784, 1, 128] - - [231, 5233.53] + - [460, 5233.53] - - [3584, 448, 1, 3328] - - [240, 8094.4] + - [469, 8094.4] - - [1408, 4, 1, 256] - - [258, 243.646] + - [487, 243.646] - - [704, 2368, 1, 3328] - - [240, 8403.11] + - [469, 8403.11] - - [2944, 448, 1, 256] - - [240, 7022.59] + - [469, 7022.59] - - [1856, 448, 1, 128] - - [235, 2842.79] + - [464, 2842.79] - - [2368, 128, 1, 1280] - - [222, 5685.52] + - [451, 5685.52] - - [256, 5888, 1, 128] - - [237, 2178.71] + - [466, 2178.71] - - [64, 6784, 1, 256] - - [240, 5385.23] + - [469, 5385.23] - - [64, 5056, 1, 1280] - - [214, 5603.29] + - [443, 5603.29] - - [4, 6784, 1, 128] - - [252, 180.256] + - [481, 180.256] - - [2944, 2944, 1, 1280] - - [249, 9129.39] + - [478, 9129.39] - - [5888, 2368, 1, 256] - - [251, 6961.69] + - [480, 6961.69] - - [4, 3584, 1, 1280] - - [195, 646.23] + - [424, 646.23] - - [1408, 128, 1, 128] - - [181, 1172.29] + - [410, 1172.29] - - [6784, 704, 1, 3328] - - [246, 9084.62] + - [475, 9084.62] - - [128, 64, 1, 1280] - - [225, 1260.41] + - [454, 1260.41] - - [2368, 256, 1, 1280] - - [246, 6643.48] + - [475, 6643.48] - - [4, 448, 1, 3328] - - [209, 433.514] + - [438, 433.514] - - [5888, 4288, 1, 128] - - [233, 4753.17] + - [462, 4753.17] - - [4, 5888, 1, 256] - - [195, 471.14] + - [424, 471.14] - - [1408, 2944, 1, 3328] - - [249, 9207.1] + - [478, 9207.1] - - [3584, 704, 1, 128] - - [235, 3762.46] + - [464, 3762.46] - - [64, 1024, 1, 256] - - [213, 1807.99] + - [442, 1807.99] - - [5056, 5056, 1, 128] - - [236, 4830.16] + - [465, 4830.16] - - [2368, 448, 1, 1280] - - [240, 7263.16] + - [469, 7263.16] - - [128, 3584, 1, 256] - - [243, 4369.17] + - [472, 4369.17] - - [704, 448, 1, 1280] - - [241, 4205.33] + - [470, 4205.33] - - [448, 5056, 1, 128] - - [232, 3855.57] + - [461, 3855.57] - - [256, 4, 1, 1280] - - [263, 157.638] + - [492, 157.638] - - [128, 5056, 1, 256] - - [246, 6109.06] + - [475, 6109.06] - - [1408, 5056, 1, 128] - - [235, 4836.68] + - [464, 4836.68] - - [2944, 3584, 1, 128] - - [235, 4532.19] + - [464, 4532.19] - - [3584, 2368, 1, 256] - - [240, 8951.34] + - [469, 8951.34] - - [5888, 5056, 1, 1280] - - [251, 9276.49] + - [480, 9276.49] - - [2368, 5056, 1, 128] - - [235, 5167.66] + - [464, 5167.66] - - [64, 704, 1, 256] - - [195, 1501.97] + - [424, 1501.97] - - [4288, 256, 1, 1280] - - [240, 7496.3] + - [469, 7496.3] - - [3584, 3584, 1, 3328] - - [241, 9301.77] + - [470, 9301.77] - - [1024, 256, 1, 128] - - [232, 1508.84] + - [461, 1508.84] - - [4, 704, 1, 128] - - [253, 12.1469] + - [482, 12.1469] - - [5888, 6784, 1, 256] - - [239, 9370.47] + - [468, 9370.47] - - [4288, 2944, 1, 3328] - - [243, 9149.09] + - [472, 9149.09] - - [2944, 64, 1, 128] - - [179, 1456.46] + - [408, 1456.46] - - [1856, 64, 1, 256] - - [205, 2210.03] + - [434, 2210.03] - - [4288, 128, 1, 3328] - - [199, 6471.95] + - [428, 6471.95] - - [4288, 704, 1, 1280] - - [246, 8934.61] + - [475, 8934.61] - - [256, 5056, 1, 1280] - - [240, 8439.13] + - [469, 8439.13] - - [1408, 256, 1, 128] - - [235, 1769.17] + - [464, 1769.17] - - [2944, 5888, 1, 3328] - - [240, 9448.04] + - [469, 9448.04] - - [6784, 5888, 1, 1280] - - [251, 9372.25] + - [480, 9372.25] - - [704, 128, 1, 256] - - [197, 2059.8] + - [426, 2059.8] - - [5888, 4288, 1, 1280] - - [243, 9244.32] + - [472, 9244.32] - - [448, 256, 1, 1280] - - [222, 4741.72] + - [451, 4741.72] - - [5888, 3584, 1, 128] - - [231, 4980.06] + - [460, 4980.06] - - [1856, 1856, 1, 128] - - [235, 4363.98] + - [464, 4363.98] - - [5056, 4, 1, 1280] - - [255, 629.641] + - [484, 629.641] - - [256, 1408, 1, 1280] - - [246, 5588.44] + - [475, 5588.44] - - [512, 16, 1, 512] - - [206, 689.953] + - [435, 689.953] - - [704, 3584, 1, 128] - - [235, 4069.67] + - [464, 4069.67] - - [5888, 448, 1, 3328] - - [251, 7925.94] + - [480, 7925.94] - - [2368, 4288, 1, 1280] - - [250, 8492.7] + - [479, 8492.7] - - [4288, 2944, 1, 128] - - [232, 5238.21] + - [461, 5238.21] - - [1024, 6784, 1, 3328] - - [246, 8578.18] + - [475, 8578.18] - - [128, 2368, 1, 256] - - [246, 3788.9] + - [475, 3788.9] - - [6784, 64, 1, 3328] - - [240, 7003.46] + - [469, 7003.46] - - [5056, 2944, 1, 3328] - - [243, 8575.45] + - [472, 8575.45] - - [448, 128, 1, 256] - - [195, 1715.06] + - [424, 1715.06] - - [2944, 3584, 1, 256] - - [240, 8994.26] + - [469, 8994.26] - - [1408, 1408, 1, 3328] - - [238, 8757.7] + - [467, 8757.7] - - [1856, 128, 1, 1280] - - [240, 5598.17] + - [469, 5598.17] - - [3584, 3584, 1, 128] - - [231, 4787.44] + - [460, 4787.44] - - [64, 3584, 1, 256] - - [246, 3546.01] + - [475, 3546.01] - - [1408, 4, 1, 3328] - - [190, 640.24] + - [419, 640.24] - - [128, 2944, 1, 3328] - - [214, 7204.24] + - [443, 7204.24] - - [3584, 704, 1, 256] - - [240, 6239.69] + - [469, 6239.69] - - [2944, 448, 1, 3328] - - [246, 7726.71] + - [475, 7726.71] - - [3584, 1408, 1, 3328] - - [238, 9358.78] + - [467, 9358.78] - - [704, 3584, 1, 1280] - - [246, 8005.28] + - [475, 8005.28] - - [2944, 6784, 1, 1280] - - [238, 9487.73] + - [467, 9487.73] - - [1856, 6784, 1, 256] - - [240, 5684.56] + - [469, 5684.56] - - [4288, 448, 1, 3328] - - [246, 8410.38] + - [475, 8410.38] - - [6784, 4288, 1, 128] - - [236, 4785.58] + - [465, 4785.58] - - [6784, 704, 1, 1280] - - [240, 5579.05] + - [469, 5579.05] - - [256, 4288, 1, 256] - - [240, 6781.43] + - [469, 6781.43] - - [3584, 64, 1, 128] - - [179, 1474.0] + - [408, 1474.0] - - [5888, 1024, 1, 3328] - - [238, 8639.49] + - [467, 8639.49] - - [448, 64, 1, 128] - - [170, 259.282] + - [399, 259.282] - - [704, 6784, 1, 1280] - - [246, 9027.25] + - [475, 9027.25] - - [5888, 128, 1, 256] - - [246, 6812.88] + - [475, 6812.88] - - [2368, 448, 1, 3328] - - [246, 7356.63] + - [475, 7356.63] - - [1856, 5056, 1, 3328] - - [245, 8871.56] + - [474, 8871.56] - - [4, 6784, 1, 256] - - [254, 469.479] + - [483, 469.479] - - [1024, 3584, 1, 128] - - [232, 3428.02] + - [461, 3428.02] - - [1024, 1408, 1, 128] - - [235, 2935.05] + - [464, 2935.05] - - [2368, 2944, 1, 128] - - [235, 4888.02] + - [464, 4888.02] - - [5056, 64, 1, 256] - - [204, 3186.16] + - [433, 3186.16] - - [4, 448, 1, 1280] - - [209, 273.167] + - [438, 273.167] - - [5056, 2944, 1, 128] - - [236, 4752.79] + - [465, 4752.79] - - [5888, 5056, 1, 3328] - - [250, 9124.77] + - [479, 9124.77] - - [1024, 704, 1, 128] - - [235, 2302.36] + - [464, 2302.36] - - [1408, 2368, 1, 128] - - [235, 3826.95] + - [464, 3826.95] - - [5888, 2368, 1, 128] - - [232, 4912.77] + - [461, 4912.77] - - [128, 5056, 1, 3328] - - [222, 7583.8] + - [451, 7583.8] - - [3584, 6784, 1, 1280] - - [249, 9313.5] + - [478, 9313.5] - - [3072, 7435, 1, 1024] - - [243, 9322.07] + - [472, 9322.07] - - [1856, 5888, 1, 256] - - [240, 5778.34] + - [469, 5778.34] - - [256, 256, 1, 256] - - [192, 1576.91] + - [421, 1576.91] - - [256, 64, 1, 128] - - [178, 173.705] + - [407, 173.705] - - [4288, 4288, 1, 3328] - - [245, 8416.27] + - [474, 8416.27] - - [4288, 1408, 1, 1280] - - [251, 9301.97] + - [480, 9301.97] - - [3584, 5056, 1, 128] - - [237, 4344.94] + - [466, 4344.94] - - [4, 1024, 1, 3328] - - [206, 615.239] + - [435, 615.239] - - [4288, 2368, 1, 256] - - [240, 9142.67] + - [469, 9142.67] - - [2944, 5056, 1, 1280] - - [240, 9399.69] + - [469, 9399.69] - - [448, 6784, 1, 256] - - [239, 5710.93] + - [468, 5710.93] - - [64, 1024, 1, 3328] - - [222, 4975.1] + - [451, 4975.1] - - [6784, 2368, 1, 3328] - - [249, 9207.63] + - [478, 9207.63] - - [256, 1024, 1, 1280] - - [246, 5983.42] + - [475, 5983.42] - - [704, 4, 1, 128] - - [252, 15.1187] + - [481, 15.1187] - - [256, 4, 1, 256] - - [209, 52.9516] + - [438, 52.9516] - - [4288, 128, 1, 256] - - [240, 5242.98] + - [469, 5242.98] - - [4288, 1856, 1, 3328] - - [251, 9354.06] + - [480, 9354.06] - - [3584, 448, 1, 128] - - [232, 3353.9] + - [461, 3353.9] - - [256, 4, 1, 3328] - - [263, 313.324] + - [492, 313.324] - - [4, 1408, 1, 1280] - - [206, 509.207] + - [435, 509.207] - - [3584, 64, 1, 1280] - - [194, 5198.42] + - [423, 5198.42] - - [1408, 448, 1, 128] - - [232, 2628.37] + - [461, 2628.37] - - [3584, 1024, 1, 1280] - - [246, 8535.01] + - [475, 8535.01] - - [1856, 5056, 1, 256] - - [238, 8184.49] + - [467, 8184.49] - - [4, 3584, 1, 256] - - [256, 395.576] + - [485, 395.576] - - [1024, 4288, 1, 256] - - [241, 5966.52] + - [470, 5966.52] - - [5888, 3584, 1, 3328] - - [244, 9189.43] + - [473, 9189.43] - - [4, 256, 1, 256] - - [260, 41.5785] + - [489, 41.5785] - - [5056, 3584, 1, 3328] - - [245, 9431.92] + - [474, 9431.92] - - [128, 5888, 1, 1280] - - [240, 8192.1] + - [469, 8192.1] - - [704, 448, 1, 128] - - [232, 1510.96] + - [461, 1510.96] - - [2368, 1408, 1, 1280] - - [240, 8415.65] + - [469, 8415.65] - - [5056, 2944, 1, 1280] - - [251, 9294.77] + - [480, 9294.77] - - [4, 4, 1, 128] - - [253, 0.1356549] + - [482, 0.1356549] - - [3584, 256, 1, 256] - - [240, 6749.55] + - [469, 6749.55] - - [128, 1856, 1, 3328] - - [193, 6797.09] + - [422, 6797.09] - - [1024, 6784, 1, 256] - - [246, 8783.09] + - [475, 8783.09] - - [4, 128, 1, 256] - - [206, 27.4067] + - [435, 27.4067] - - [64, 64, 1, 1280] - - [225, 712.448] + - [454, 712.448] - - [6784, 4, 1, 128] - - [253, 122.06] + - [482, 122.06] - - [2944, 1408, 1, 128] - - [235, 4430.46] + - [464, 4430.46] - - [448, 128, 1, 3328] - - [222, 5097.34] + - [451, 5097.34] - - [64, 2944, 1, 3328] - - [222, 6362.2] + - [451, 6362.2] - - [64, 4288, 1, 3328] - - [222, 6565.01] + - [451, 6565.01] - - [5056, 6784, 1, 3328] - - [246, 8121.18] + - [475, 8121.18] - - [128, 2944, 1, 256] - - [240, 4692.17] + - [469, 4692.17] - - [128, 6784, 1, 128] - - [169, 2687.46] + - [398, 2687.46] - - [3584, 4288, 1, 256] - - [246, 9193.99] + - [475, 9193.99] - - [448, 1856, 1, 256] - - [246, 6231.39] + - [475, 6231.39] - - [1856, 6784, 1, 3328] - - [251, 9191.48] + - [480, 9191.48] - - [3584, 128, 1, 3328] - - [240, 7368.47] + - [469, 7368.47] - - [64, 1856, 1, 256] - - [191, 2184.63] + - [420, 2184.63] - - [1024, 448, 1, 1280] - - [246, 6977.32] + - [475, 6977.32] - - [5888, 4288, 1, 256] - - [246, 5780.5] + - [475, 5780.5] - - [4, 448, 1, 128] - - [253, 9.06] + - [482, 9.06] - - [5056, 1408, 1, 256] - - [240, 5601.35] + - [469, 5601.35] - - [64, 256, 1, 1280] - - [206, 1927.63] + - [435, 1927.63] - - [3584, 1024, 1, 256] - - [251, 7542.84] + - [480, 7542.84] - - [256, 704, 1, 256] - - [240, 2957.62] + - [469, 2957.62] - - [5888, 5888, 1, 256] - - [251, 7344.14] + - [480, 7344.14] - - [4288, 1024, 1, 1280] - - [246, 8925.84] + - [475, 8925.84] - - [5888, 128, 1, 3328] - - [240, 8410.07] + - [469, 8410.07] - - [448, 6784, 1, 3328] - - [240, 8862.56] + - [469, 8862.56] - - [2944, 1408, 1, 1280] - - [251, 7478.93] + - [480, 7478.93] - - [1024, 32, 1, 512] - - [195, 1777.35] + - [424, 1777.35] - - [2944, 1856, 1, 3328] - - [240, 9153.43] + - [469, 9153.43] - - [2368, 64, 1, 128] - - [179, 1102.3] + - [408, 1102.3] - - [2944, 2944, 1, 128] - - [231, 4591.95] + - [460, 4591.95] - - [4, 128, 1, 3328] - - [261, 119.09] + - [490, 119.09] - - [3584, 5888, 1, 1280] - - [240, 9222.49] + - [469, 9222.49] - - [64, 4, 1, 128] - - [252, 1.03516] + - [481, 1.03516] - - [6784, 1856, 1, 1280] - - [240, 9136.07] + - [469, 9136.07] - - [2944, 5056, 1, 256] - - [246, 8860.13] + - [475, 8860.13] - - [2944, 5888, 1, 1280] - - [239, 9643.63] + - [468, 9643.63] - - [5888, 256, 1, 3328] - - [246, 8799.53] + - [475, 8799.53] - - [1856, 5888, 1, 3328] - - [246, 9457.53] + - [475, 9457.53] - - [3584, 1408, 1, 256] - - [246, 8672.53] + - [475, 8672.53] - - [704, 3584, 1, 3328] - - [246, 8525.3] + - [475, 8525.3] - - [5056, 448, 1, 1280] - - [246, 8843.77] + - [475, 8843.77] - - [3584, 1856, 1, 3328] - - [238, 8881.53] + - [467, 8881.53] - - [64, 1408, 1, 128] - - [167, 747.142] + - [396, 747.142] - - [1408, 704, 1, 1280] - - [240, 8342.93] + - [469, 8342.93] - - [2944, 1024, 1, 256] - - [251, 8079.58] + - [480, 8079.58] - - [1024, 2368, 1, 128] - - [235, 3347.58] + - [464, 3347.58] - - [2368, 4288, 1, 3328] - - [246, 9467.67] + - [475, 9467.67] - - [4, 1408, 1, 256] - - [258, 257.563] + - [487, 257.563] - - [1024, 1408, 1, 1280] - - [246, 8241.84] + - [475, 8241.84] - - [64, 64, 1, 256] - - [206, 190.059] + - [435, 190.059] - - [704, 256, 1, 3328] - - [240, 4519.28] + - [469, 4519.28] - - [6784, 5056, 1, 256] - - [239, 9133.78] + - [468, 9133.78] - - [4, 4288, 1, 3328] - - [190, 670.075] + - [419, 670.075] - - [448, 6784, 1, 128] - - [232, 4481.92] + - [461, 4481.92] - - [4, 704, 1, 3328] - - [262, 523.071] + - [491, 523.071] - - [448, 2944, 1, 256] - - [240, 7022.59] + - [469, 7022.59] - - [2944, 6784, 1, 256] - - [246, 9199.84] + - [475, 9199.84] - - [2368, 2368, 1, 1280] - - [251, 8646.84] + - [480, 8646.84] - - [4, 4, 1, 1280] - - [209, 3.11176] + - [438, 3.11176] - - [1856, 3584, 1, 1280] - - [238, 8805.45] + - [467, 8805.45] - - [64, 2944, 1, 256] - - [212, 2565.76] + - [441, 2565.76] - - [3584, 1408, 1, 1280] - - [251, 9273.12] + - [480, 9273.12] - - [448, 256, 1, 128] - - [167, 941.13] + - [396, 941.13] - - [4288, 448, 1, 128] - - [233, 3215.2] + - [462, 3215.2] - - [5056, 256, 1, 1280] - - [246, 8790.13] + - [475, 8790.13] - - [1856, 1408, 1, 3328] - - [240, 9310.73] + - [469, 9310.73] - - [128, 128, 1, 128] - - [175, 155.215] + - [404, 155.215] - - [1024, 4288, 1, 3328] - - [243, 8528.12] + - [472, 8528.12] - - [448, 2368, 1, 256] - - [247, 5097.34] + - [476, 5097.34] - - [1024, 4, 1, 128] - - [253, 10.3721] + - [482, 10.3721] - - [5056, 448, 1, 256] - - [246, 8236.78] + - [475, 8236.78] - - [2944, 2368, 1, 3328] - - [239, 9331.16] + - [468, 9331.16] - - [704, 128, 1, 3328] - - [214, 5969.3] + - [443, 5969.3] - - [64, 64, 1, 3328] - - [230, 1494.78] + - [459, 1494.78] - - [1024, 1856, 1, 1280] - - [245, 6356.43] + - [474, 6356.43] - - [6784, 1856, 1, 256] - - [246, 9068.63] + - [475, 9068.63] - - [128, 2368, 1, 3328] - - [222, 6714.22] + - [451, 6714.22] - - [1024, 5888, 1, 256] - - [246, 5501.6] + - [475, 5501.6] - - [5056, 128, 1, 1280] - - [202, 6455.64] + - [431, 6455.64] - - [5056, 64, 1, 3328] - - [207, 6703.81] + - [436, 6703.81] - - [128, 704, 1, 128] - - [168, 696.618] + - [397, 696.618] - - [1408, 2368, 1, 256] - - [240, 8667.25] + - [469, 8667.25] - - [1408, 1408, 1, 256] - - [251, 7615.81] + - [480, 7615.81] - - [4, 64, 1, 128] - - [253, 1.08463] + - [482, 1.08463] - - [64, 128, 1, 1280] - - [225, 1379.81] + - [454, 1379.81] - - [2368, 2368, 1, 128] - - [235, 4582.26] + - [464, 4582.26] - - [64, 5888, 1, 128] - - [168, 2086.37] + - [397, 2086.37] - - [5888, 4, 1, 3328] - - [189, 667.514] + - [418, 667.514] - - [6784, 1408, 1, 128] - - [236, 4516.34] + - [465, 4516.34] - - [4288, 5888, 1, 256] - - [251, 8497.43] + - [480, 8497.43] - - [1408, 5056, 1, 256] - - [240, 8867.46] + - [469, 8867.46] - - [5056, 128, 1, 3328] - - [222, 7678.98] + - [451, 7678.98] - - [128, 128, 1, 1280] - - [210, 2016.59] + - [439, 2016.59] - - [448, 704, 1, 256] - - [241, 3030.89] + - [470, 3030.89] - - [4288, 3584, 1, 128] - - [232, 5246.33] + - [461, 5246.33] - - [2944, 128, 1, 3328] - - [207, 6795.16] + - [436, 6795.16] - - [128, 5056, 1, 1280] - - [193, 6193.09] + - [422, 6193.09] - - [3584, 5056, 1, 1280] - - [245, 9499.17] + - [474, 9499.17] - - [256, 448, 1, 1280] - - [201, 4267.56] + - [430, 4267.56] - - [704, 704, 1, 128] - - [235, 2259.32] + - [464, 2259.32] - - [5056, 4, 1, 128] - - [253, 12.5313] + - [482, 12.5313] - - [704, 256, 1, 1280] - - [240, 4355.97] + - [469, 4355.97] - - [64, 2368, 1, 3328] - - [214, 6310.97] + - [443, 6310.97] - - [1856, 1024, 1, 128] - - [231, 4065.43] + - [460, 4065.43] - - [1856, 64, 1, 128] - - [170, 936.329] + - [399, 936.329] - - [64, 6784, 1, 1280] - - [193, 5731.8] + - [422, 5731.8] - - [704, 4288, 1, 256] - - [246, 5218.9] + - [475, 5218.9] - - [5888, 2368, 1, 1280] - - [240, 9378.9] + - [469, 9378.9] - - [128, 256, 1, 256] - - [210, 1219.37] + - [439, 1219.37] - - [256, 64, 1, 1280] - - [212, 1820.54] + - [441, 1820.54] - - [2368, 5888, 1, 1280] - - [251, 9143.64] + - [480, 9143.64] - - [5888, 256, 1, 1280] - - [240, 8678.47] + - [469, 8678.47] - - [4, 5888, 1, 1280] - - [187, 668.242] + - [416, 668.242] - - [704, 128, 1, 128] - - [175, 649.556] + - [404, 649.556] - - [1024, 4, 1, 1280] - - [206, 478.465] + - [435, 478.465] - - [2368, 1856, 1, 3328] - - [238, 8153.87] + - [467, 8153.87] - - [2368, 128, 1, 128] - - [173, 1858.21] + - [402, 1858.21] - - [2944, 704, 1, 256] - - [240, 8438.07] + - [469, 8438.07] - - [5056, 128, 1, 128] - - [169, 2689.63] + - [398, 2689.63] - - [256, 704, 1, 3328] - - [240, 4541.18] + - [469, 4541.18] - - [704, 3584, 1, 256] - - [241, 7771.07] + - [470, 7771.07] - - [1024, 1024, 1, 1024] - - [246, 8305.62] + - [475, 8305.62] - - [704, 2944, 1, 3328] - - [246, 9166.48] + - [475, 9166.48] - - [6784, 1024, 1, 128] - - [231, 4362.31] + - [460, 4362.31] - - [256, 448, 1, 128] - - [178, 899.614] + - [407, 899.614] - - [448, 1024, 1, 3328] - - [240, 7385.56] + - [469, 7385.56] - - [2944, 1024, 1, 3328] - - [243, 8779.81] + - [472, 8779.81] - - [2944, 5056, 1, 128] - - [235, 5103.11] + - [464, 5103.11] - - [1408, 6784, 1, 256] - - [246, 8346.89] + - [475, 8346.89] - - [6784, 1408, 1, 3328] - - [242, 8878.4] + - [471, 8878.4] - - [4288, 6784, 1, 128] - - [231, 5432.99] + - [460, 5432.99] - - [704, 64, 1, 256] - - [220, 1441.89] + - [449, 1441.89] - - [5888, 4, 1, 1280] - - [257, 636.641] + - [486, 636.641] - - [256, 2368, 1, 3328] - - [240, 6804.8] + - [469, 6804.8] - - [6784, 2944, 1, 1280] - - [239, 9472.26] + - [468, 9472.26] - - [4288, 1856, 1, 128] - - [235, 4886.38] + - [464, 4886.38] - - [1856, 2944, 1, 128] - - [232, 4642.96] + - [461, 4642.96] - - [6784, 448, 1, 128] - - [232, 4369.17] + - [461, 4369.17] - - [64, 3584, 1, 128] - - [179, 1645.85] + - [408, 1645.85] - - [448, 5056, 1, 1280] - - [240, 8553.64] + - [469, 8553.64] - - [2368, 1856, 1, 128] - - [232, 4741.85] + - [461, 4741.85] - - [128, 448, 1, 1280] - - [222, 3745.01] + - [451, 3745.01] - - [4288, 704, 1, 256] - - [240, 8444.16] + - [469, 8444.16] - - [256, 3584, 1, 128] - - [232, 2454.96] + - [461, 2454.96] - - [5888, 704, 1, 256] - - [240, 8819.57] + - [469, 8819.57] - - [3584, 1024, 1, 128] - - [235, 4094.96] + - [464, 4094.96] - - [256, 5888, 1, 3328] - - [249, 8538.33] + - [478, 8538.33] - - [1408, 4288, 1, 3328] - - [251, 9212.57] + - [480, 9212.57] - - [6784, 4288, 1, 256] - - [239, 9163.12] + - [468, 9163.12] - - [4288, 256, 1, 128] - - [232, 3081.44] + - [461, 3081.44] - - [5888, 256, 1, 256] - - [240, 7680.75] + - [469, 7680.75] - - [6784, 1024, 1, 1280] - - [251, 9248.63] + - [480, 9248.63] - - [5888, 1024, 1, 128] - - [235, 4061.94] + - [464, 4061.94] - - [1024, 128, 1, 256] - - [246, 2317.39] + - [475, 2317.39] - - [128, 64, 1, 3328] - - [229, 2116.79] + - [458, 2116.79] - - [448, 64, 1, 256] - - [212, 1079.52] + - [441, 1079.52] - - [2368, 256, 1, 128] - - [233, 2229.83] + - [462, 2229.83] - - [6784, 3584, 1, 1280] - - [246, 9096.6] + - [475, 9096.6] - - [1024, 6784, 1, 1280] - - [244, 9112.9] + - [473, 9112.9] - - [2944, 64, 1, 1280] - - [202, 4983.0] + - [431, 4983.0] - - [1408, 2944, 1, 1280] - - [241, 9131.63] + - [470, 9131.63] - - [256, 1856, 1, 256] - - [249, 4432.86] + - [478, 4432.86] - - [1408, 2368, 1, 3328] - - [249, 8449.18] + - [478, 8449.18] - - [2944, 4, 1, 3328] - - [195, 673.94] + - [424, 673.94] - - [128, 1408, 1, 3328] - - [214, 6582.47] + - [443, 6582.47] - - [2944, 1856, 1, 128] - - [232, 4827.54] + - [461, 4827.54] - - [256, 2944, 1, 128] - - [235, 2416.66] + - [464, 2416.66] - - [256, 6784, 1, 128] - - [235, 3118.76] + - [464, 3118.76] - - [2368, 4, 1, 128] - - [253, 22.7197] + - [482, 22.7197] - - [1408, 256, 1, 3328] - - [240, 3733.82] + - [469, 3733.82] - - [1856, 4, 1, 128] - - [252, 7.20009] + - [481, 7.20009] - - [1024, 16, 1, 512] - - [208, 1165.18] + - [437, 1165.18] - - [5056, 6784, 1, 128] - - [236, 4949.13] + - [465, 4949.13] - - [4288, 5056, 1, 128] - - [235, 4966.9] + - [464, 4966.9] - - [1856, 5888, 1, 128] - - [231, 4351.76] + - [460, 4351.76] - - [2944, 5888, 1, 256] - - [251, 8460.99] + - [480, 8460.99] - - [3584, 1856, 1, 256] - - [246, 8876.7] + - [475, 8876.7] - - [4288, 3584, 1, 1280] - - [239, 9603.7] + - [468, 9603.7] - - [2368, 448, 1, 256] - - [240, 6604.7] + - [469, 6604.7] - - [4288, 256, 1, 3328] - - [240, 7619.89] + - [469, 7619.89] - - [1856, 704, 1, 128] - - [232, 3629.61] + - [461, 3629.61] - - [1408, 64, 1, 256] - - [196, 2168.21] + - [425, 2168.21] - - [64, 1856, 1, 128] - - [172, 979.762] + - [401, 979.762] - - [4, 256, 1, 128] - - [253, 5.23595] + - [482, 5.23595] - - [704, 4288, 1, 3328] - - [246, 9014.52] + - [475, 9014.52] - - [704, 5888, 1, 128] - - [233, 4221.77] + - [462, 4221.77] - - [6784, 3584, 1, 128] - - [231, 5360.73] + - [460, 5360.73] - - [1024, 64, 1, 256] - - [191, 1588.85] + - [420, 1588.85] - - [64, 2368, 1, 256] - - [246, 2552.55] + - [475, 2552.55] - - [4288, 5056, 1, 3328] - - [245, 8193.38] + - [474, 8193.38] - - [4, 1856, 1, 1280] - - [195, 499.192] + - [424, 499.192] - - [4288, 128, 1, 128] - - [232, 2373.57] + - [461, 2373.57] - - [1408, 1408, 1, 128] - - [235, 3753.88] + - [464, 3753.88] - - [1024, 128, 1, 3328] - - [217, 5656.32] + - [446, 5656.32] - - [1856, 128, 1, 128] - - [168, 1617.58] + - [397, 1617.58] - - [5056, 2368, 1, 256] - - [251, 5553.41] + - [480, 5553.41] - - [4288, 704, 1, 3328] - - [239, 6962.06] + - [468, 6962.06] - - [448, 3584, 1, 256] - - [249, 5981.5] + - [478, 5981.5] - - [64, 128, 1, 128] - - [186, 74.9983] + - [415, 74.9983] - - [2368, 64, 1, 1280] - - [222, 5041.33] + - [451, 5041.33] - - [2368, 1024, 1, 1280] - - [247, 7740.97] + - [476, 7740.97] - - [2944, 1408, 1, 3328] - - [249, 9204.65] + - [478, 9204.65] - - [1408, 448, 1, 256] - - [246, 5954.4] + - [475, 5954.4] - - [1024, 1408, 1, 3328] - - [243, 8161.54] + - [472, 8161.54] - - [2560, 7133, 1, 2560] - - [238, 9636.69] + - [467, 9636.69] - - [1408, 4, 1, 1280] - - [190, 520.979] + - [419, 520.979] - - [5888, 3584, 1, 256] - - [251, 9225.26] + - [480, 9225.26] - - [128, 1024, 1, 1280] - - [193, 4755.55] + - [422, 4755.55] - - [1408, 1856, 1, 3328] - - [243, 9130.87] + - [472, 9130.87] - - [4, 4, 1, 3328] - - [263, 7.03333] + - [492, 7.03333] - - [6784, 1408, 1, 1280] - - [240, 9346.91] + - [469, 9346.91] - - [4, 1024, 1, 1280] - - [190, 422.913] + - [419, 422.913] - - [704, 2944, 1, 256] - - [246, 8332.06] + - [475, 8332.06] - - [704, 4288, 1, 128] - - [232, 4371.14] + - [461, 4371.14] - - [2368, 4288, 1, 128] - - [232, 3988.89] + - [461, 3988.89] - - [64, 4288, 1, 1280] - - [222, 5407.63] + - [451, 5407.63] - - [6784, 64, 1, 1280] - - [202, 5708.25] + - [431, 5708.25] - - [3584, 128, 1, 128] - - [168, 2463.2] + - [397, 2463.2] - - [1024, 6784, 1, 128] - - [233, 3862.12] + - [462, 3862.12] - - [4, 1856, 1, 128] - - [253, 30.6362] + - [482, 30.6362] - - [1408, 64, 1, 3328] - - [222, 6095.48] + - [451, 6095.48] - - [6784, 4, 1, 256] - - [255, 487.938] + - [484, 487.938] - - [1408, 1408, 1, 1280] - - [251, 8640.63] + - [480, 8640.63] - - [256, 2368, 1, 256] - - [243, 4282.36] + - [472, 4282.36] - - [448, 4288, 1, 3328] - - [240, 8516.13] + - [469, 8516.13] - - [2368, 1408, 1, 256] - - [246, 8632.19] + - [475, 8632.19] - - [5888, 5056, 1, 128] - - [232, 5091.11] + - [461, 5091.11] - - [704, 2368, 1, 256] - - [246, 7664.8] + - [475, 7664.8] - - [2944, 448, 1, 1280] - - [246, 7618.35] + - [475, 7618.35] - - [5888, 2368, 1, 3328] - - [249, 9343.48] + - [478, 9343.48] - - [64, 2944, 1, 1280] - - [214, 5162.18] + - [443, 5162.18] - - [448, 1856, 1, 1280] - - [240, 7028.0] + - [469, 7028.0] - - [4288, 448, 1, 1280] - - [240, 5855.76] + - [469, 5855.76] - - [5888, 704, 1, 3328] - - [249, 9190.91] + - [478, 9190.91] - - [5056, 256, 1, 128] - - [235, 3235.94] + - [464, 3235.94] - - [1856, 256, 1, 128] - - [233, 1849.78] + - [462, 1849.78] - - [5056, 128, 1, 256] - - [246, 6109.06] + - [475, 6109.06] - - [704, 4, 1, 256] - - [206, 125.256] + - [435, 125.256] - - [1408, 5888, 1, 128] - - [232, 5055.16] + - [461, 5055.16] - - [4288, 4, 1, 128] - - [252, 95.7209] + - [481, 95.7209] - - [1408, 1024, 1, 256] - - [240, 7370.28] + - [469, 7370.28] - - [1024, 1856, 1, 128] - - [232, 2966.8] + - [461, 2966.8] - - [256, 704, 1, 128] - - [234, 528.229] + - [463, 528.229] - - [256, 1024, 1, 128] - - [232, 1171.69] + - [461, 1171.69] - - [448, 1024, 1, 256] - - [246, 5624.65] + - [475, 5624.65] - - [128, 4, 1, 3328] - - [263, 191.985] + - [492, 191.985] - - [5056, 6784, 1, 1280] - - [240, 9544.07] + - [469, 9544.07] - - [704, 5056, 1, 3328] - - [247, 8790.35] + - [476, 8790.35] - - [64, 1408, 1, 1280] - - [214, 4505.7] + - [443, 4505.7] - - [3584, 5056, 1, 3328] - - [245, 9073.52] + - [474, 9073.52] - - [1856, 4, 1, 3328] - - [263, 612.875] + - [492, 612.875] - - [4, 2944, 1, 128] - - [252, 72.0145] + - [481, 72.0145] - - [2368, 2944, 1, 3328] - - [238, 9314.68] + - [467, 9314.68] - - [448, 448, 1, 1280] - - [222, 5129.91] + - [451, 5129.91] - - [2368, 3584, 1, 256] - - [240, 8998.8] - - - [1024, 256, 1, 1280] - - [247, 3566.68] + - [469, 8998.8] - - [5056, 3584, 1, 1280] - - [241, 9345.17] + - [470, 9345.17] - - [448, 4, 1, 3328] - - [263, 487.337] + - [492, 487.337] - - [1856, 2944, 1, 1280] - - [251, 8438.79] + - [480, 8438.79] - - [3584, 2368, 1, 1280] - - [246, 9298.9] + - [475, 9298.9] - - [128, 1024, 1, 256] - - [198, 2356.45] + - [427, 2356.45] - - [2944, 1408, 1, 256] - - [238, 5440.82] + - [467, 5440.82] - - [4288, 1408, 1, 3328] - - [238, 9386.09] + - [467, 9386.09] - - [3584, 64, 1, 3328] - - [194, 6310.97] + - [423, 6310.97] - - [1408, 128, 1, 256] - - [240, 2942.53] + - [469, 2942.53] - - [2944, 1024, 1, 128] - - [235, 3927.99] + - [464, 3927.99] - - [4288, 5056, 1, 1280] - - [242, 8328.58] + - [471, 8328.58] - - [5888, 6784, 1, 1280] - - [251, 9757.44] + - [480, 9757.44] - - [6784, 5056, 1, 128] - - [231, 5101.4] + - [460, 5101.4] - - [256, 1024, 1, 3328] - - [240, 6475.87] + - [469, 6475.87] - - [3584, 4, 1, 256] - - [256, 420.973] + - [485, 420.973] - - [1856, 64, 1, 3328] - - [222, 6409.2] + - [451, 6409.2] - - [64, 6784, 1, 128] - - [170, 2387.32] + - [399, 2387.32] - - [5888, 1408, 1, 3328] - - [245, 9655.89] + - [474, 9655.89] - - [5888, 64, 1, 1280] - - [240, 5870.86] + - [469, 5870.86] - - [256, 5056, 1, 256] - - [243, 6109.06] + - [472, 6109.06] - - [128, 3584, 1, 128] - - [173, 2383.23] + - [402, 2383.23] - - [448, 3584, 1, 3328] - - [238, 7092.28] + - [467, 7092.28] - - [704, 2368, 1, 128] - - [232, 3741.08] + - [461, 3741.08] - - [5888, 256, 1, 128] - - [233, 2977.54] + - [462, 2977.54] - - [4, 5056, 1, 128] - - [252, 132.72] + - [481, 132.72] - - [448, 256, 1, 256] - - [204, 2308.29] + - [433, 2308.29] - - [704, 4, 1, 3328] - - [209, 552.674] + - [438, 552.674] - - [1408, 256, 1, 256] - - [240, 4577.22] + - [469, 4577.22] - - [3584, 1856, 1, 128] - - [232, 4571.86] + - [461, 4571.86] - - [4288, 4288, 1, 128] - - [235, 5284.65] + - [464, 5284.65] - - [1856, 1024, 1, 3328] - - [246, 6362.25] + - [475, 6362.25] - - [128, 5888, 1, 3328] - - [216, 7040.83] + - [445, 7040.83] - - [1024, 5056, 1, 256] - - [251, 7855.7] + - [480, 7855.7] - - [2368, 1408, 1, 3328] - - [246, 9205.66] + - [475, 9205.66] - - [5888, 448, 1, 256] - - [243, 5538.84] + - [472, 5538.84] - - [5888, 6784, 1, 128] - - [231, 4500.85] + - [460, 4500.85] - - [2368, 4, 1, 3328] - - [209, 642.898] + - [438, 642.898] - - [6784, 5056, 1, 1280] - - [247, 9249.23] + - [476, 9249.23] - - [5056, 704, 1, 1280] - - [246, 8883.37] + - [475, 8883.37] - - [1408, 256, 1, 1280] - - [240, 5632.1] + - [469, 5632.1] - - [4288, 6784, 1, 1280] - - [246, 8843.31] + - [475, 8843.31] - - [128, 704, 1, 256] - - [204, 2045.19] + - [433, 2045.19] - - [448, 128, 1, 1280] - - [214, 3807.17] + - [443, 3807.17] - - [6784, 4, 1, 3328] - - [257, 684.671] + - [486, 684.671] - - [4288, 4, 1, 1280] - - [206, 601.925] + - [435, 601.925] - - [1024, 64, 1, 3328] - - [218, 3928.48] + - [447, 3928.48] - - [1856, 4, 1, 256] - - [256, 293.394] + - [485, 293.394] - - [64, 3584, 1, 1280] - - [240, 5265.55] + - [469, 5265.55] - - [6784, 1408, 1, 256] - - [240, 9059.36] + - [469, 9059.36] - - [3584, 5888, 1, 128] - - [232, 5084.29] + - [461, 5084.29] - - [5056, 5888, 1, 256] - - [251, 8590.09] + - [480, 8590.09] - - [2368, 1024, 1, 256] - - [243, 4493.13] + - [472, 4493.13] - - [2944, 1856, 1, 256] - - [249, 5202.41] + - [478, 5202.41] - - [1856, 6784, 1, 1280] - - [247, 9071.48] + - [476, 9071.48] - - [64, 5056, 1, 128] - - [170, 2038.42] + - [399, 2038.42] - - [5888, 64, 1, 128] - - [169, 2016.59] + - [398, 2016.59] - - [448, 704, 1, 128] - - [233, 1173.65] + - [462, 1173.65] - - [4, 1024, 1, 128] - - [252, 8.89685] + - [481, 8.89685] - - [4288, 3584, 1, 256] - - [246, 9080.26] + - [475, 9080.26] - - [1408, 704, 1, 128] - - [232, 3165.71] + - [461, 3165.71] - - [64, 256, 1, 3328] - - [226, 3126.59] + - [455, 3126.59] - - [5056, 1856, 1, 1280] - - [243, 8857.55] + - [472, 8857.55] - - [1408, 1024, 1, 3328] - - [249, 8177.12] + - [478, 8177.12] - - [2368, 256, 1, 3328] - - [240, 6810.31] + - [469, 6810.31] - - [5888, 3584, 1, 1280] - - [238, 9535.55] + - [467, 9535.55] - - [1856, 3584, 1, 3328] - - [240, 9281.91] + - [469, 9281.91] - - [5888, 128, 1, 1280] - - [246, 8136.82] + - [475, 8136.82] - - [1024, 2944, 1, 256] - - [238, 7247.96] + - [467, 7247.96] - - [448, 6784, 1, 1280] - - [246, 7014.04] + - [475, 7014.04] - - [256, 3584, 1, 1280] - - [240, 7738.64] + - [469, 7738.64] - - [448, 128, 1, 128] - - [170, 496.048] + - [399, 496.048] - - [704, 5056, 1, 256] - - [246, 8609.44] + - [475, 8609.44] - - [3584, 1024, 1, 3328] - - [239, 7765.73] + - [468, 7765.73] - - [2944, 1856, 1, 1280] - - [251, 7776.03] + - [480, 7776.03] - - [128, 256, 1, 128] - - [183, 296.308] + - [412, 296.308] - - [5056, 256, 1, 256] - - [240, 7829.73] + - [469, 7829.73] - - [2368, 3584, 1, 3328] - - [239, 8896.08] + - [468, 8896.08] - - [2944, 704, 1, 1280] - - [249, 6855.83] + - [478, 6855.83] - - [128, 4, 1, 256] - - [258, 24.9242] + - [487, 24.9242] - - [2944, 3584, 1, 1280] - - [251, 9049.22] + - [480, 9049.22] - - [1856, 5888, 1, 1280] - - [246, 9432.06] + - [475, 9432.06] - - [256, 256, 1, 1280] - - [211, 3942.12] + - [440, 3942.12] - - [5056, 448, 1, 3328] - - [251, 4587.83] + - [480, 4587.83] - - [4288, 1408, 1, 256] - - [251, 5408.83] + - [480, 5408.83] - - [3584, 64, 1, 256] - - [220, 2496.71] + - [449, 2496.71] - - [64, 1856, 1, 3328] - - [193, 5896.78] + - [422, 5896.78] - - [256, 1408, 1, 128] - - [232, 1643.17] + - [461, 1643.17] - - [5888, 1408, 1, 128] - - [231, 4436.37] + - [460, 4436.37] - - [4288, 2368, 1, 1280] - - [240, 9433.04] + - [469, 9433.04] - - [4, 4288, 1, 256] - - [255, 442.732] + - [484, 442.732] - - [256, 4288, 1, 128] - - [232, 2814.79] + - [461, 2814.79] - - [256, 128, 1, 3328] - - [221, 3951.26] + - [450, 3951.26] - - [6784, 2368, 1, 256] - - [240, 9169.99] + - [469, 9169.99] - - [5888, 128, 1, 128] - - [169, 3156.81] + - [398, 3156.81] - - [4288, 1856, 1, 256] - - [246, 5658.23] + - [475, 5658.23] - - [1856, 256, 1, 3328] - - [240, 7646.37] + - [469, 7646.37] - - [1856, 2944, 1, 256] - - [247, 6444.98] + - [476, 6444.98] - - [5056, 1024, 1, 128] - - [231, 4607.3] + - [460, 4607.3] - - [64, 5888, 1, 1280] - - [246, 5842.46] + - [475, 5842.46] - - [1760, 7133, 1, 1760] - - [239, 9097.84] + - [468, 9097.84] - - [6784, 256, 1, 128] - - [232, 3685.41] + - [461, 3685.41] - - [5888, 704, 1, 128] - - [231, 3656.23] + - [460, 3656.23] - - [6784, 64, 1, 128] - - [182, 2191.52] + - [411, 2191.52] - - [1024, 4288, 1, 1280] - - [246, 9199.32] + - [475, 9199.32] - - [2368, 5056, 1, 3328] - - [242, 9072.88] + - [471, 9072.88] - - [448, 4, 1, 128] - - [253, 5.42937] + - [482, 5.42937] - - [4, 256, 1, 3328] - - [263, 311.037] + - [492, 311.037] - - [4288, 1024, 1, 3328] - - [244, 8660.33] + - [473, 8660.33] - - [1024, 5056, 1, 3328] - - [240, 8886.76] + - [469, 8886.76] - - [1024, 1856, 1, 3328] - - [245, 8426.24] + - [474, 8426.24] - - [704, 704, 1, 1280] - - [240, 7661.8] + - [469, 7661.8] - - [128, 2368, 1, 1280] - - [214, 5746.15] + - [443, 5746.15] - - [1408, 128, 1, 3328] - - [222, 6530.87] + - [451, 6530.87] - - [3584, 256, 1, 1280] - - [246, 7634.04] + - [475, 7634.04] - - [4, 128, 1, 128] - - [253, 2.07874] + - [482, 2.07874] - - [704, 6784, 1, 128] - - [235, 4589.59] + - [464, 4589.59] - - [3584, 128, 1, 1280] - - [240, 7078.24] + - [469, 7078.24] - - [4, 256, 1, 1280] - - [209, 178.187] + - [438, 178.187] - - [128, 704, 1, 3328] - - [214, 5959.81] + - [443, 5959.81] - - [4288, 6784, 1, 256] - - [240, 9326.54] + - [469, 9326.54] - - [3584, 2944, 1, 3328] - - [242, 9114.16] + - [471, 9114.16] - - [128, 1856, 1, 256] - - [246, 3672.65] + - [475, 3672.65] - - [64, 4288, 1, 256] - - [240, 3457.51] + - [469, 3457.51] - - [4, 3584, 1, 3328] - - [189, 694.37] + - [418, 694.37] - - [64, 4, 1, 3328] - - [209, 71.5738] + - [438, 71.5738] - - [4, 64, 1, 3328] - - [209, 91.9069] + - [438, 91.9069] - - [5888, 2944, 1, 256] - - [239, 7241.55] + - [468, 7241.55] - - [2368, 6784, 1, 128] - - [235, 5229.63] + - [464, 5229.63] - - [448, 4288, 1, 1280] - - [240, 8416.4] + - [469, 8416.4] - - [448, 1856, 1, 3328] - - [240, 7161.56] + - [469, 7161.56] - - [4, 1024, 1, 256] - - [206, 187.346] + - [435, 187.346] - - [5056, 4288, 1, 256] - - [251, 8947.26] + - [480, 8947.26] - - [1024, 448, 1, 256] - - [246, 5318.96] + - [475, 5318.96] - - [1024, 3584, 1, 256] - - [241, 6152.04] + - [470, 6152.04] - - [2944, 128, 1, 1280] - - [222, 6053.63] + - [451, 6053.63] - - [1856, 5056, 1, 128] - - [232, 5091.42] + - [461, 5091.42] - - [64, 256, 1, 256] - - [195, 771.112] + - [424, 771.112] - - [1408, 4, 1, 128] - - [252, 40.8758] + - [481, 40.8758] - - [128, 2368, 1, 128] - - [180, 1520.37] + - [409, 1520.37] - - [256, 704, 1, 1280] - - [240, 4329.81] + - [469, 4329.81] - - [64, 2368, 1, 128] - - [171, 1212.52] + - [400, 1212.52] - - [6784, 6784, 1, 3328] - - [251, 8310.67] + - [480, 8310.67] - - [448, 5888, 1, 1280] - - [246, 8502.33] + - [475, 8502.33] - - [5056, 448, 1, 128] - - [232, 4161.0] + - [461, 4161.0] - - [3584, 2944, 1, 128] - - [232, 4363.51] + - [461, 4363.51] - - [6784, 256, 1, 1280] - - [246, 8629.67] + - [475, 8629.67] - - [256, 2944, 1, 1280] - - [246, 7277.48] + - [475, 7277.48] - - [64, 4288, 1, 128] - - [171, 1822.06] + - [400, 1822.06] - - [2368, 5888, 1, 3328] - - [240, 9017.52] + - [469, 9017.52] - - [4, 64, 1, 256] - - [206, 16.1627] + - [435, 16.1627] - - [704, 1024, 1, 3328] - - [246, 8059.55] + - [475, 8059.55] - - [2368, 1856, 1, 1280] - - [246, 8813.24] + - [475, 8813.24] - - [128, 448, 1, 128] - - [167, 588.244] + - [396, 588.244] - - [128, 6784, 1, 256] - - [246, 6538.28] + - [475, 6538.28] - - [3584, 4288, 1, 128] - - [232, 5025.46] + - [461, 5025.46] - - [64, 448, 1, 128] - - [184, 231.793] + - [413, 231.793] - - [5888, 4288, 1, 3328] - - [240, 9515.88] + - [469, 9515.88] - - [2368, 704, 1, 256] - - [246, 7642.84] + - [475, 7642.84] - - [256, 1856, 1, 3328] - - [246, 6547.17] + - [475, 6547.17] - - [1856, 128, 1, 256] - - [240, 3782.28] + - [469, 3782.28] - - [6784, 128, 1, 128] - - [174, 2835.54] + - [403, 2835.54] - - [3584, 1408, 1, 128] - - [231, 3049.21] + - [460, 3049.21] - - [1856, 5056, 1, 1280] - - [247, 8863.3] + - [476, 8863.3] - - [2944, 1024, 1, 1280] - - [251, 8873.25] + - [480, 8873.25] - - [5056, 4, 1, 256] - - [187, 494.121] + - [416, 494.121] - - [3584, 5888, 1, 3328] - - [239, 9585.25] + - [468, 9585.25] - - [2368, 4288, 1, 256] - - [251, 6419.05] + - [480, 6419.05] - - [1024, 2368, 1, 3328] - - [246, 8645.36] + - [475, 8645.36] - - [64, 704, 1, 3328] - - [228, 4399.93] + - [457, 4399.93] - - [704, 1408, 1, 256] - - [240, 7428.54] + - [469, 7428.54] - - [6784, 1856, 1, 3328] - - [251, 9163.66] + - [480, 9163.66] - - [1024, 2944, 1, 128] - - [235, 3551.98] + - [464, 3551.98] - - [1024, 3584, 1, 1280] - - [249, 9112.47] + - [478, 9112.47] - - [4288, 5888, 1, 3328] - - [239, 8524.05] + - [468, 8524.05] - - [4288, 4, 1, 3328] - - [206, 620.016] + - [435, 620.016] - - [256, 1408, 1, 256] - - [240, 4505.7] + - [469, 4505.7] - - [448, 2944, 1, 1280] - - [240, 7612.87] + - [469, 7612.87] - - [4, 5888, 1, 128] - - [252, 174.564] + - [481, 174.564] - - [1024, 2944, 1, 3328] - - [245, 9136.74] + - [474, 9136.74] - - [3584, 6784, 1, 256] - - [245, 7253.89] + - [474, 7253.89] - - [256, 6784, 1, 1280] - - [240, 8637.72] + - [469, 8637.72] - - [1856, 3584, 1, 256] - - [246, 8199.67] + - [475, 8199.67] - - [128, 448, 1, 3328] - - [227, 4799.92] + - [456, 4799.92] - - [6784, 1856, 1, 128] - - [232, 5185.62] + - [461, 5185.62] - - [4, 448, 1, 256] - - [206, 86.9848] + - [435, 86.9848] - - [2944, 704, 1, 128] - - [235, 3798.64] + - [464, 3798.64] - - [256, 5888, 1, 1280] - - [240, 8678.47] + - [469, 8678.47] - - [4, 128, 1, 1280] - - [209, 102.5] + - [438, 102.5] - - [4288, 6784, 1, 3328] - - [245, 8209.4] + - [474, 8209.4] - - [6784, 128, 1, 1280] - - [222, 6562.99] + - [451, 6562.99] - - [64, 1408, 1, 256] - - [212, 2059.8] + - [441, 2059.8] - - [7680, 5481, 1, 2560] - - [251, 9426.79] + - [480, 9426.79] - - [2368, 1408, 1, 128] - - [232, 4532.5] + - [461, 4532.5] - - [1856, 448, 1, 256] - - [240, 6275.48] + - [469, 6275.48] - - [1408, 1024, 1, 128] - - [232, 3604.58] + - [461, 3604.58] - - [128, 64, 1, 128] - - [167, 87.4813] + - [396, 87.4813] - - [6784, 3584, 1, 3328] - - [247, 8991.92] + - [476, 8991.92] - - [2944, 64, 1, 3328] - - [216, 6043.36] + - [445, 6043.36] - - [64, 64, 1, 128] - - [172, 36.309] + - [401, 36.309] - - [2368, 5056, 1, 1280] - - [246, 9438.48] + - [475, 9438.48] - - [64, 4, 1, 1280] - - [209, 40.2569] + - [438, 40.2569] - - [1408, 2368, 1, 1280] - - [242, 7738.16] + - [471, 7738.16] - - [128, 1408, 1, 1280] - - [214, 4937.74] + - [443, 4937.74] - - [256, 64, 1, 3328] - - [224, 2683.46] + - [453, 2683.46] - - [2944, 4288, 1, 128] - - [232, 5173.81] + - [461, 5173.81] - - [2944, 2944, 1, 256] - - [240, 8943.92] + - [469, 8943.92] - - [2944, 4, 1, 1280] - - [189, 617.857] + - [418, 617.857] - - [5888, 4, 1, 256] - - [255, 483.218] + - [484, 483.218] - - [6784, 256, 1, 256] - - [246, 7916.7] + - [475, 7916.7] - - [256, 5056, 1, 3328] - - [240, 8953.25] + - [469, 8953.25] - - [128, 4288, 1, 1280] - - [193, 6015.05] + - [422, 6015.05] - - [5056, 1856, 1, 128] - - [234, 4221.15] + - [463, 4221.15] - - [5888, 1408, 1, 256] - - [245, 9144.85] + - [474, 9144.85] - - [128, 128, 1, 256] - - [195, 759.938] + - [424, 759.938] - - [5056, 4, 1, 3328] - - [255, 642.818] + - [484, 642.818] - - [4288, 3584, 1, 3328] - - [241, 9300.05] + - [470, 9300.05] - - [448, 704, 1, 3328] - - [247, 4481.08] + - [476, 4481.08] - - [448, 448, 1, 128] - - [171, 1360.81] + - [400, 1360.81] - - [1024, 2368, 1, 1280] - - [240, 8570.29] + - [469, 8570.29] - - [1856, 704, 1, 3328] - - [240, 8448.26] + - [469, 8448.26] - - [4, 2368, 1, 128] - - [252, 64.5902] + - [481, 64.5902] - - [5888, 6784, 1, 3328] - - [247, 9447.12] + - [476, 9447.12] - - [704, 4288, 1, 1280] - - [249, 7476.87] + - [478, 7476.87] - - [704, 256, 1, 256] - - [240, 2957.62] + - [469, 2957.62] - - [6784, 448, 1, 3328] - - [243, 8886.22] + - [472, 8886.22] - - [4288, 1024, 1, 128] - - [231, 3864.49] + - [460, 3864.49] - - [49, 512, 128, 2048] - - [274, 7112.78] + - [503, 7112.78] - - [196, 256, 256, 1024] - - [268, 8302.7] + - [497, 8302.7] - - [784, 512, 256, 128] - - [266, 9061.36] + - [495, 9061.36] - - [49, 2048, 128, 512] - - [264, 6963.36] - - - [784, 512, 64, 128] - - [266, 8822.62] + - [493, 6963.36] - - [784, 128, 128, 512] - - [273, 8983.63] + - [502, 8983.63] - - [196, 256, 64, 1024] - - [272, 7823.5] + - [501, 7823.5] - - [3136, 256, 256, 64] - - [269, 9051.38] + - [498, 9051.38] - - [3136, 64, 128, 64] - - [265, 8581.35] + - [494, 8581.35] - - [49, 2048, 256, 512] - - [264, 7049.64] - - - [196, 1024, 64, 256] - - [267, 7953.69] + - [493, 7049.64] - - [784, 128, 256, 512] - - [275, 9102.99] + - [504, 9102.99] - - [196, 256, 128, 1024] - - [267, 8085.89] - - - [3136, 64, 64, 256] - - [271, 9266.13] - - - [784, 128, 64, 512] - - [272, 8809.39] - - - [49, 2048, 64, 512] - - [264, 6843.95] + - [496, 8085.89] - - [3136, 64, 128, 256] - - [271, 9381.39] + - [500, 9381.39] - - [3136, 256, 128, 64] - - [269, 8982.64] + - [498, 8982.64] - - [784, 512, 128, 128] - - [266, 8965.99] - - - [3136, 256, 64, 64] - - [269, 8879.8] + - [495, 8965.99] - - [3136, 64, 256, 256] - - [271, 9566.43] - - - [3136, 64, 64, 64] - - [270, 8314.05] + - [500, 9566.43] - - [3136, 64, 256, 64] - - [265, 8743.8] + - [494, 8743.8] - - [196, 1024, 128, 256] - - [268, 8119.43] - - - [49, 512, 64, 2048] - - [276, 7055.41] + - [497, 8119.43] - - [49, 512, 256, 2048] - - [277, 7166.41] + - [506, 7166.41] - - [196, 1024, 256, 256] - - [268, 8210.66] + - [497, 8210.66] - - [5329, 160, 64, 64] - - [284, 8156.89] + - [513, 8156.89] - - [1225, 288, 64, 48] - - [288, 6926.23] + - [517, 6926.23] - - [1225, 192, 64, 64] - - [290, 7840.1] + - [519, 7840.1] - - [64, 1280, 64, 384] - - [291, 9276.11] + - [520, 9276.11] - - [1225, 384, 64, 192] - - [281, 9162.35] + - [510, 9162.35] - - [1225, 288, 64, 64] - - [282, 7495.27] + - [511, 7495.27] - - [5329, 64, 64, 80] - - [283, 8480.13] + - [512, 8480.13] - - [289, 1024, 64, 256] - - [281, 8483.83] + - [510, 8483.83] - - [289, 768, 64, 192] - - [287, 8234.84] + - [516, 8234.84] - - [289, 768, 64, 128] - - [287, 7988.81] + - [516, 7988.81] - - [64, 1536, 64, 384] - - [291, 9323.65] + - [520, 9323.65] - - [1225, 384, 64, 64] - - [290, 8158.8] + - [519, 8158.8] - - [64, 2048, 64, 192] - - [287, 8818.61] + - [516, 8818.61] - - [64, 1280, 64, 320] - - [283, 9202.17] + - [512, 9202.17] - - [1225, 384, 64, 96] - - [281, 8540.7] + - [510, 8540.7] - - [64, 1280, 64, 448] - - [287, 9317.82] + - [516, 9317.82] - - [289, 768, 64, 160] - - [291, 8128.81] + - [520, 8128.81] - - [1225, 192, 64, 32] - - [290, 6495.37] + - [519, 6495.37] - - [64, 1536, 64, 256] - - [287, 9143.0] + - [516, 9143.0] - - [1225, 256, 64, 48] - - [285, 7545.36] + - [514, 7545.36] - - [1225, 256, 64, 64] - - [286, 7972.45] + - [515, 7972.45] - - [1225, 192, 64, 48] - - [289, 7348.9] + - [518, 7348.9] - - [289, 1024, 64, 384] - - [279, 8725.66] + - [508, 8725.66] - - [289, 1024, 64, 192] - - [281, 8313.16] + - [510, 8313.16] - - [64, 1280, 64, 192] - - [283, 8768.68] + - [512, 8768.68] - - [64, 2048, 64, 320] - - [280, 9147.98] + - [509, 9147.98] - - [64, 2048, 64, 448] - - [278, 9304.16] + - [507, 9304.16] - - [64, 2048, 64, 384] - - [280, 9235.28] + - [509, 9235.28] - - [289, 1024, 64, 128] - - [287, 7989.51] + - [516, 7989.51] - - [4096, 1024, 1, 2984] - - [326, 9846.39] + - [555, 9846.39] - - [1024, 4096, 1, 3437] - - [327, 9915.8] + - [556, 9915.8] - - [1024, 4096, 1, 3235] - - [320, 9914.02] + - [549, 9914.02] - - [4096, 1024, 1, 4032] - - [326, 9926.06] + - [555, 9926.06] - - [1024, 4096, 1, 3334] - - [327, 9918.27] + - [556, 9918.27] - - [4096, 1024, 1, 3288] - - [327, 9854.67] + - [556, 9854.67] - - [1024, 4096, 1, 3515] - - [327, 9924.03] + - [556, 9924.03] - - [4096, 1024, 1, 3437] - - [327, 9869.63] + - [556, 9869.63] - - [1024, 4096, 1, 3259] - - [327, 9907.65] + - [556, 9907.65] - - [1024, 4096, 1, 3384] - - [319, 9921.21] + - [548, 9921.21] - - [64, 92, 688, 92] - - [297, 6137.89] + - [526, 6137.89] - - [4096, 1024, 1, 3458] - - [326, 9887.69] + - [555, 9887.69] - - [1024, 4096, 1, 3412] - - [326, 9930.56] + - [555, 9930.56] - - [1024, 4096, 1, 3529] - - [320, 9924.54] + - [549, 9924.54] - - [1024, 4096, 1, 4032] - - [327, 9963.48] + - [556, 9963.48] - - [4096, 1024, 1, 3999] - - [327, 9895.0] + - [556, 9895.0] - - [1024, 4096, 1, 3079] - - [320, 9894.58] + - [549, 9894.58] - - [1024, 4096, 1, 3876] - - [319, 9949.39] + - [548, 9949.39] - - [1024, 4096, 1, 3450] - - [327, 9915.65] + - [556, 9915.65] - - [1024, 4096, 1, 3256] - - [327, 9911.18] + - [556, 9911.18] - - [4096, 1024, 1, 3403] - - [326, 9858.93] + - [555, 9858.93] - - [1024, 1024, 1, 3975] - - [317, 8990.81] + - [546, 8990.81] - - [1024, 4096, 1, 3359] - - [327, 9915.0] + - [556, 9915.0] - - [4096, 1024, 1, 3549] - - [326, 9870.66] + - [555, 9870.66] - - [4096, 1024, 1, 3176] - - [327, 9855.92] + - [556, 9855.92] - - [1024, 4096, 1, 3504] - - [319, 9934.17] + - [548, 9934.17] - - [4096, 1024, 1, 3314] - - [326, 9873.9] + - [555, 9873.9] - - [4096, 1024, 1, 3183] - - [326, 9843.84] + - [555, 9843.84] - - [1024, 4096, 1, 3209] - - [320, 9904.97] + - [549, 9904.97] - - [1024, 4096, 1, 3720] - - [319, 9934.16] + - [548, 9934.16] - - [1024, 4096, 1, 3859] - - [319, 9952.53] + - [548, 9952.53] - - [1024, 33708, 1, 4059] - - [319, 10321.5] + - [548, 10321.5] - - [1024, 4096, 1, 3968] - - [319, 9955.96] + - [548, 9955.96] - - [64, 123, 528, 123] - - [292, 6916.21] + - [521, 6916.21] - - [4096, 1024, 1, 3477] - - [327, 9872.03] + - [556, 9872.03] - - [4096, 1024, 1, 3233] - - [327, 9862.35] + - [556, 9862.35] - - [4096, 1024, 1, 3409] - - [327, 9876.86] + - [556, 9876.86] - - [4096, 1024, 1, 3564] - - [327, 9870.49] + - [556, 9870.49] - - [64, 102, 624, 100] - - [292, 5773.16] + - [521, 5773.16] - - [4096, 1024, 1, 3190] - - [326, 9850.97] + - [555, 9850.97] - - [64, 112, 576, 111] - - [292, 6517.35] + - [521, 6517.35] - - [1024, 4096, 1, 3288] - - [326, 9911.9] + - [555, 9911.9] - - [4096, 1024, 1, 3451] - - [326, 9859.61] + - [555, 9859.61] - - [1024, 4096, 1, 3348] - - [319, 9915.47] + - [548, 9915.47] - - [64, 102, 624, 102] - - [292, 5783.7] + - [521, 5783.7] - - [1024, 4096, 1, 3465] - - [320, 9913.12] + - [549, 9913.12] - - [1024, 33708, 1, 4032] - - [319, 10340.4] + - [548, 10340.4] - - [1024, 33708, 1, 3840] - - [319, 10341.8] + - [548, 10341.8] - - [4096, 1024, 1, 3391] - - [327, 9861.77] + - [556, 9861.77] - - [1024, 4096, 1, 3530] - - [319, 9920.44] + - [548, 9920.44] - - [4096, 1024, 1, 3209] - - [326, 9847.0] + - [555, 9847.0] - - [1024, 4096, 1, 3457] - - [320, 9917.29] + - [549, 9917.29] - - [1024, 4096, 1, 3386] - - [319, 9917.65] + - [548, 9917.65] - - [4096, 1024, 1, 3350] - - [326, 9884.54] + - [555, 9884.54] - - [1024, 4096, 1, 3184] - - [327, 9925.98] + - [556, 9925.98] - - [1024, 4096, 1, 3093] - - [326, 9902.55] + - [555, 9902.55] - - [64, 133, 480, 135] - - [309, 6205.97] + - [538, 6205.97] - - [1024, 4096, 1, 3400] - - [319, 9917.1] + - [548, 9917.1] - - [1024, 1024, 1, 4026] - - [325, 9014.39] + - [554, 9014.39] - - [1024, 4096, 1, 3214] - - [319, 9895.94] + - [548, 9895.94] - - [4096, 1024, 1, 3406] - - [327, 9857.82] + - [556, 9857.82] - - [1024, 4096, 1, 3565] - - [326, 9919.37] + - [555, 9919.37] - - [4096, 1024, 1, 3536] - - [327, 9889.06] + - [556, 9889.06] - - [1024, 4096, 1, 3183] - - [326, 9907.55] + - [555, 9907.55] - - [1024, 4096, 1, 3462] - - [327, 9922.4] + - [556, 9922.4] - - [4096, 1024, 1, 3130] - - [320, 9846.04] + - [549, 9846.04] - - [4096, 1024, 1, 3381] - - [327, 9868.27] + - [556, 9868.27] - - [4096, 1024, 1, 3298] - - [326, 9870.54] + - [555, 9870.54] - - [1024, 4096, 1, 3292] - - [319, 9906.3] + - [548, 9906.3] - - [4096, 1024, 1, 3289] - - [326, 9856.55] + - [555, 9856.55] - - [64, 160, 400, 159] - - [312, 7427.84] + - [541, 7427.84] - - [1024, 4096, 1, 3379] - - [319, 9917.09] + - [548, 9917.09] - - [1024, 4096, 1, 3990] - - [320, 9947.37] + - [549, 9947.37] - - [1024, 4096, 1, 3540] - - [327, 9935.76] + - [556, 9935.76] - - [4096, 1024, 1, 3412] - - [327, 9867.56] + - [556, 9867.56] - - [1024, 1024, 1, 3780] - - [322, 9036.26] + - [551, 9036.26] - - [1024, 4096, 1, 3555] - - [326, 9927.37] + - [555, 9927.37] - - [1024, 4096, 1, 3518] - - [320, 9925.55] + - [549, 9925.55] - - [4096, 1024, 1, 3189] - - [326, 9861.24] + - [555, 9861.24] - - [1024, 4096, 1, 3298] - - [320, 9923.22] + - [549, 9923.22] - - [4096, 1024, 1, 3072] - - [326, 9872.08] + - [555, 9872.08] - - [1024, 4096, 1, 3393] - - [327, 9929.28] + - [556, 9929.28] - - [1024, 4096, 1, 3207] - - [319, 9912.81] + - [548, 9912.81] - - [64, 228, 272, 232] - - [315, 7350.14] + - [544, 7350.14] - - [64, 23, 2720, 23] - - [296, 2640.25] + - [525, 2640.25] - - [4096, 1024, 1, 3487] - - [327, 9860.91] + - [556, 9860.91] - - [1024, 1024, 1, 3822] - - [325, 8993.96] + - [554, 8993.96] - - [64, 77, 816, 77] - - [297, 5273.19] + - [526, 5273.19] - - [4096, 1024, 1, 3431] - - [327, 9867.53] + - [556, 9867.53] - - [4096, 1024, 1, 3378] - - [326, 9888.14] + - [555, 9888.14] - - [4096, 1024, 1, 3529] - - [320, 9879.5] + - [549, 9879.5] - - [4096, 1024, 1, 3460] - - [327, 9877.25] + - [556, 9877.25] - - [1024, 4096, 1, 3336] - - [319, 9912.41] + - [548, 9912.41] - - [1024, 4096, 1, 3501] - - [320, 9914.4] + - [549, 9914.4] - - [64, 159, 400, 159] - - [310, 7016.51] + - [539, 7016.51] - - [1024, 4096, 1, 3584] - - [327, 9940.59] + - [556, 9940.59] - - [64, 135, 480, 134] - - [310, 6241.39] + - [539, 6241.39] - - [64, 99, 624, 99] - - [301, 5617.39] + - [530, 5617.39] - - [4096, 1024, 1, 2499] - - [326, 9813.57] + - [555, 9813.57] - - [1024, 1024, 1, 3942] - - [322, 9060.01] + - [551, 9060.01] - - [4096, 1024, 1, 3352] - - [326, 9867.12] + - [555, 9867.12] - - [1024, 4096, 1, 3543] - - [327, 9928.77] + - [556, 9928.77] - - [1024, 4096, 1, 3476] - - [326, 9931.58] + - [555, 9931.58] - - [1024, 33708, 1, 3822] - - [319, 10324.7] + - [548, 10324.7] - - [1024, 4096, 1, 3436] - - [319, 9917.28] + - [548, 9917.28] - - [1024, 1024, 1, 3861] - - [318, 8998.49] + - [547, 8998.49] - - [1024, 1024, 1, 4000] - - [323, 9058.3] + - [552, 9058.3] - - [1024, 4096, 1, 3594] - - [319, 9927.88] + - [548, 9927.88] - - [4096, 1024, 1, 3514] - - [327, 9872.3] + - [556, 9872.3] - - [1024, 4096, 1, 3064] - - [326, 9907.1] + - [555, 9907.1] - - [4096, 1024, 1, 3371] - - [319, 9857.74] + - [548, 9857.74] - - [4096, 1024, 1, 3558] - - [327, 9876.31] + - [556, 9876.31] - - [4096, 1024, 1, 3517] - - [326, 9866.45] + - [555, 9866.45] - - [4096, 1024, 1, 3144] - - [326, 9846.36] + - [555, 9846.36] - - [1024, 4096, 1, 3312] - - [319, 9932.85] + - [548, 9932.85] - - [4096, 1024, 1, 3079] - - [326, 9851.1] + - [555, 9851.1] - - [1024, 4096, 1, 3415] - - [319, 9919.47] + - [548, 9919.47] - - [1024, 4096, 1, 3221] - - [326, 9908.18] + - [555, 9908.18] - - [1024, 4096, 1, 3978] - - [320, 9944.41] + - [549, 9944.41] - - [4096, 1024, 1, 3876] - - [326, 9898.99] + - [555, 9898.99] - - [1024, 4096, 1, 3528] - - [319, 9919.6] + - [548, 9919.6] - - [1024, 4096, 1, 3181] - - [327, 9894.86] + - [556, 9894.86] - - [4096, 1024, 1, 3445] - - [326, 9878.54] + - [555, 9878.54] - - [4096, 1024, 1, 3450] - - [319, 9864.82] + - [548, 9864.82] - - [4096, 1024, 1, 3377] - - [326, 9879.69] + - [555, 9879.69] - - [1024, 4096, 1, 3532] - - [320, 9928.19] + - [549, 9928.19] - - [1024, 33708, 1, 3944] - - [319, 10329.7] + - [548, 10329.7] - - [4096, 1024, 1, 3483] - - [326, 9861.83] + - [555, 9861.83] - - [1024, 4096, 1, 3358] - - [319, 9903.69] + - [548, 9903.69] - - [4096, 1024, 1, 3464] - - [326, 9876.84] + - [555, 9876.84] - - [4096, 1024, 1, 3282] - - [319, 9859.23] + - [548, 9859.23] - - [4096, 1024, 1, 3256] - - [327, 9855.1] + - [556, 9855.1] - - [1024, 4096, 1, 3057] - - [326, 9910.75] + - [555, 9910.75] - - [4096, 1024, 1, 3481] - - [326, 9866.29] + - [555, 9866.29] - - [4096, 1024, 1, 3340] - - [326, 9862.25] + - [555, 9862.25] - - [1024, 1024, 1, 3870] - - [325, 9082.45] + - [554, 9082.45] - - [1024, 4096, 1, 3273] - - [319, 9916.29] + - [548, 9916.29] - - [64, 65, 992, 65] - - [310, 4683.01] + - [539, 4683.01] - - [4096, 1024, 1, 3392] - - [320, 9881.12] + - [549, 9881.12] - - [4096, 1024, 1, 3337] - - [326, 9864.5] + - [555, 9864.5] - - [4096, 1024, 1, 3359] - - [326, 9874.42] + - [555, 9874.42] - - [4096, 1024, 1, 3498] - - [327, 9864.35] + - [556, 9864.35] - - [4096, 1024, 1, 3169] - - [326, 9851.1] + - [555, 9851.1] - - [1024, 33708, 1, 3859] - - [320, 10332.6] + - [549, 10332.6] - - [64, 19, 3264, 19] - - [296, 2182.14] + - [525, 2182.14] - - [1024, 4096, 1, 3103] - - [319, 9898.9] + - [548, 9898.9] - - [4096, 1024, 1, 3900] - - [326, 9897.12] + - [555, 9897.12] - - [1024, 4096, 1, 3442] - - [326, 9938.97] + - [555, 9938.97] - - [1024, 4096, 1, 3248] - - [326, 9939.92] + - [555, 9939.92] - - [1024, 4096, 1, 3351] - - [327, 9923.23] + - [556, 9923.23] - - [4096, 1024, 1, 3593] - - [326, 9894.36] + - [555, 9894.36] - - [1024, 4096, 1, 3780] - - [326, 9941.96] + - [555, 9941.96] - - [64, 133, 480, 133] - - [310, 6180.79] + - [539, 6180.79] - - [1024, 33708, 1, 3681] - - [319, 10332.3] + - [548, 10332.3] - - [4096, 1024, 1, 3374] - - [320, 9859.36] + - [549, 9859.36] - - [1024, 4096, 1, 3557] - - [319, 9928.2] + - [548, 9928.2] - - [4096, 1024, 1, 3906] - - [326, 9907.07] + - [555, 9907.07] - - [4096, 1024, 1, 3504] - - [326, 9886.05] + - [555, 9886.05] - - [1024, 4096, 1, 3270] - - [326, 9916.37] + - [555, 9916.37] - - [4096, 1024, 1, 3098] - - [319, 9854.76] + - [548, 9854.76] - - [64, 232, 272, 232] - - [315, 7394.1] + - [544, 7394.1] - - [4096, 1024, 1, 3216] - - [327, 9876.57] + - [556, 9876.57] - - [64, 148, 432, 148] - - [312, 6663.85] + - [541, 6663.85] - - [1024, 4096, 1, 3550] - - [326, 9920.28] + - [555, 9920.28] - - [4096, 1024, 1, 3449] - - [320, 9870.57] + - [549, 9870.57] - - [1024, 4096, 1, 3403] - - [327, 9908.21] + - [556, 9908.21] - - [1024, 4096, 1, 3523] - - [326, 9932.71] + - [555, 9932.71] - - [1024, 4096, 1, 3486] - - [326, 9917.46] + - [555, 9917.46] - - [1024, 4096, 1, 3564] - - [326, 9923.44] + - [555, 9923.44] - - [1024, 33708, 1, 4005] - - [319, 10339.5] + - [548, 10339.5] - - [4096, 1024, 1, 3296] - - [326, 9879.78] + - [555, 9879.78] - - [1024, 4096, 1, 3263] - - [319, 9907.17] + - [548, 9907.17] - - [64, 25, 2512, 25] - - [296, 2848.17] + - [525, 2848.17] - - [1024, 4096, 1, 3130] - - [327, 9900.1] + - [556, 9900.1] - - [1024, 4096, 1, 3295] - - [327, 9895.45] + - [556, 9895.45] - - [1024, 33708, 1, 3925] - - [320, 10342.3] + - [549, 10342.3] - - [1024, 4096, 1, 3378] - - [319, 9921.37] + - [548, 9921.37] - - [4096, 1024, 1, 3720] - - [327, 9885.82] + - [556, 9885.82] - - [4096, 1024, 1, 3399] - - [326, 9880.65] + - [555, 9880.65] - - [4096, 1024, 1, 3543] - - [327, 9870.73] + - [556, 9870.73] - - [64, 9, 6544, 9] - - [299, 955.17] + - [528, 955.17] - - [4096, 1024, 1, 3497] - - [326, 9868.43] + - [555, 9868.43] - - [4096, 1024, 1, 3594] - - [327, 9876.88] + - [556, 9876.88] - - [1024, 4096, 1, 3144] - - [327, 9901.96] + - [556, 9901.96] - - [1024, 4096, 1, 3975] - - [320, 9950.19] + - [549, 9950.19] - - [4096, 1024, 1, 3205] - - [327, 9856.07] + - [556, 9856.07] - - [1024, 33708, 1, 3995] - - [319, 10331.1] + - [548, 10331.1] - - [1024, 4096, 1, 3392] - - [319, 9935.78] + - [548, 9935.78] - - [1024, 4096, 1, 3055] - - [327, 9893.25] + - [556, 9893.25] - - [1024, 4096, 1, 4026] - - [327, 9940.22] + - [556, 9940.22] - - [4096, 1024, 1, 3557] - - [326, 9884.0] + - [555, 9884.0] - - [4096, 1024, 1, 3515] - - [326, 9871.94] + - [555, 9871.94] - - [4096, 1024, 1, 3486] - - [327, 9860.74] + - [556, 9860.74] - - [4096, 1024, 1, 3457] - - [327, 9885.37] + - [556, 9885.37] - - [1024, 4096, 1, 3511] - - [319, 9928.24] + - [548, 9928.24] - - [4096, 1024, 1, 3138] - - [326, 9854.06] + - [555, 9854.06] - - [1024, 4096, 1, 3339] - - [320, 9912.89] + - [549, 9912.89] - - [1024, 4096, 1, 3939] - - [320, 9952.26] + - [549, 9952.26] - - [4096, 1024, 1, 3500] - - [320, 9863.62] + - [549, 9863.62] - - [4096, 1024, 1, 3395] - - [327, 9883.82] + - [556, 9883.82] - - [4096, 1024, 1, 3968] - - [327, 9920.36] + - [556, 9920.36] - - [4096, 1024, 1, 4020] - - [327, 9912.81] + - [556, 9912.81] - - [4096, 1024, 1, 3942] - - [326, 9910.17] + - [555, 9910.17] - - [1024, 1024, 1, 4032] - - [316, 9024.74] + - [545, 9024.74] - - [4096, 1024, 1, 3349] - - [327, 9866.04] + - [556, 9866.04] - - [1024, 4096, 1, 3322] - - [320, 9908.43] + - [549, 9908.43] - - [4096, 1024, 1, 3452] - - [326, 9872.69] + - [555, 9872.69] - - [1024, 4096, 1, 3417] - - [326, 9912.64] + - [555, 9912.64] - - [1024, 1024, 1, 4012] - - [324, 9085.47] + - [553, 9085.47] - - [1024, 4096, 1, 3526] - - [320, 9920.36] + - [549, 9920.36] - - [4096, 1024, 1, 3485] - - [320, 9861.64] + - [549, 9861.64] - - [1024, 1024, 1, 3681] - - [324, 8991.46] + - [553, 8991.46] - - [4096, 1024, 1, 3303] - - [327, 9861.3] + - [556, 9861.3] - - [4096, 1024, 1, 3344] - - [327, 9892.44] + - [556, 9892.44] - - [1024, 4096, 1, 3479] - - [327, 9921.77] + - [556, 9921.77] - - [4096, 1024, 1, 3300] - - [326, 9868.64] + - [555, 9868.64] - - [1024, 4096, 1, 3439] - - [320, 9918.29] + - [549, 9918.29] - - [4096, 1024, 1, 3280] - - [327, 9875.29] + - [556, 9875.29] - - [1024, 4096, 1, 3245] - - [319, 9910.49] + - [548, 9910.49] - - [1024, 4096, 1, 3328] - - [319, 9941.6] + - [548, 9941.6] - - [4096, 1024, 1, 3418] - - [319, 9870.76] + - [548, 9870.76] - - [1024, 4096, 1, 3493] - - [327, 9938.45] + - [556, 9938.45] - - [1024, 4096, 1, 3500] - - [319, 9916.93] + - [548, 9916.93] - - [1024, 4096, 1, 3166] - - [319, 9898.12] + - [548, 9898.12] - - [4096, 1024, 1, 3126] - - [320, 9847.04] + - [549, 9847.04] - - [1024, 4096, 1, 3277] - - [327, 9898.66] + - [556, 9898.66] - - [1024, 4096, 1, 3315] - - [326, 9923.11] + - [555, 9923.11] - - [1024, 1024, 1, 3927] - - [317, 8987.71] + - [546, 8987.71] - - [1024, 4096, 1, 3414] - - [319, 9916.01] + - [548, 9916.01] - - [4096, 1024, 1, 3531] - - [326, 9871.92] + - [555, 9871.92] - - [4096, 1024, 1, 3484] - - [319, 9867.86] + - [548, 9867.86] - - [1024, 4096, 1, 3180] - - [326, 9904.09] + - [555, 9904.09] - - [4096, 1024, 1, 3360] - - [326, 9879.57] + - [555, 9879.57] - - [1024, 33708, 1, 3990] - - [319, 10335.0] + - [548, 10335.0] - - [4096, 1024, 1, 3466] - - [326, 9875.02] + - [555, 9875.02] - - [1024, 4096, 1, 3428] - - [319, 9916.02] + - [548, 9916.02] - - [1024, 4096, 1, 3137] - - [326, 9913.27] + - [555, 9913.27] - - [4096, 1024, 1, 4059] - - [326, 9901.86] + - [555, 9901.86] - - [1024, 4096, 1, 3353] - - [326, 9914.6] + - [555, 9914.6] - - [1024, 4096, 1, 3942] - - [326, 9944.5] + - [555, 9944.5] - - [4096, 1024, 1, 3506] - - [319, 9875.75] + - [548, 9875.75] - - [1024, 1024, 1, 3894] - - [317, 8946.55] + - [546, 8946.55] - - [4096, 1024, 1, 3508] - - [327, 9877.67] + - [556, 9877.67] - - [64, 132, 480, 135] - - [310, 6164.86] + - [539, 6164.86] - - [4096, 1024, 1, 3956] - - [319, 9907.83] + - [548, 9907.83] - - [64, 7, 8192, 7] - - [298, 813.078] + - [527, 813.078] - - [1024, 4096, 1, 3272] - - [320, 9909.82] + - [549, 9909.82] - - [1024, 4096, 1, 3443] - - [327, 9929.83] + - [556, 9929.83] - - [1024, 4096, 1, 3375] - - [327, 9909.23] + - [556, 9909.23] - - [1024, 4096, 1, 3525] - - [327, 9929.27] + - [556, 9929.27] - - [4096, 1024, 1, 3472] - - [326, 9889.97] + - [555, 9889.97] - - [1024, 4096, 1, 3520] - - [319, 9947.79] + - [548, 9947.79] - - [4096, 1024, 1, 3322] - - [326, 9862.98] + - [555, 9862.98] - - [4096, 1024, 1, 3387] - - [326, 9861.62] + - [555, 9861.62] - - [64, 8, 7280, 8] - - [304, 1024.1] + - [533, 1024.1] - - [1024, 33708, 1, 3939] - - [319, 10339.9] + - [548, 10339.9] - - [4096, 1024, 1, 3345] - - [327, 9873.68] + - [556, 9873.68] - - [4096, 1024, 1, 2967] - - [326, 9839.21] + - [555, 9839.21] - - [1024, 4096, 1, 3453] - - [319, 9905.81] + - [548, 9905.81] - - [1024, 4096, 1, 3640] - - [326, 9934.05] + - [555, 9934.05] - - [4096, 1024, 1, 3291] - - [320, 9860.84] + - [549, 9860.84] - - [1024, 4096, 1, 3350] - - [327, 9918.03] + - [556, 9918.03] - - [4096, 1024, 1, 3417] - - [326, 9864.61] + - [555, 9864.61] - - [64, 135, 480, 135] - - [310, 6265.45] + - [539, 6265.45] - - [1024, 4096, 1, 3467] - - [320, 9906.95] + - [549, 9906.95] - - [1024, 4096, 1, 3491] - - [326, 9933.3] + - [555, 9933.3] - - [1024, 4096, 1, 3822] - - [326, 9938.75] + - [555, 9938.75] - - [4096, 1024, 1, 3292] - - [326, 9849.21] + - [555, 9849.21] - - [1024, 4096, 1, 3231] - - [319, 9905.82] + - [548, 9905.82] - - [1024, 4096, 1, 3364] - - [320, 9930.32] + - [549, 9930.32] - - [1024, 4096, 1, 3995] - - [320, 9943.76] + - [549, 9943.76] - - [1024, 4096, 1, 3545] - - [319, 9928.53] + - [548, 9928.53] - - [1024, 1024, 1, 3876] - - [317, 9003.04] + - [546, 9003.04] - - [1024, 4096, 1, 3186] - - [319, 9921.01] + - [548, 9921.01] - - [4096, 1024, 1, 3432] - - [326, 9875.29] + - [555, 9875.29] - - [64, 84, 752, 85] - - [297, 5704.51] + - [526, 5704.51] - - [4096, 1024, 1, 3367] - - [320, 9868.06] + - [549, 9868.06] - - [4096, 1024, 1, 3503] - - [327, 9871.01] + - [556, 9871.01] - - [1024, 4096, 1, 3095] - - [320, 9902.9] + - [549, 9902.9] - - [4096, 1024, 1, 3465] - - [327, 9872.17] + - [556, 9872.17] - - [1024, 4096, 1, 3402] - - [326, 9914.66] + - [555, 9914.66] - - [4096, 1024, 1, 3140] - - [326, 9847.95] + - [555, 9847.95] - - [1024, 1024, 1, 4050] - - [323, 9055.75] + - [552, 9055.75] - - [4096, 1024, 1, 3424] - - [320, 9894.62] + - [549, 9894.62] - - [4096, 1024, 1, 3257] - - [319, 9860.97] + - [548, 9860.97] - - [4096, 1024, 1, 2917] - - [326, 9845.91] + - [555, 9845.91] - - [1024, 33708, 1, 3640] - - [319, 10321.7] + - [548, 10321.7] - - [1024, 4096, 1, 3456] - - [319, 9950.35] + - [548, 9950.35] - - [1024, 4096, 1, 3014] - - [319, 9907.97] + - [548, 9907.97] - - [4096, 1024, 1, 3372] - - [327, 9868.37] + - [556, 9868.37] - - [64, 132, 480, 132] - - [310, 6121.62] + - [539, 6121.62] - - [1024, 4096, 1, 3294] - - [327, 9903.23] + - [556, 9903.23] - - [4096, 1024, 1, 3446] - - [327, 9871.69] + - [556, 9871.69] - - [1024, 4096, 1, 3389] - - [320, 9909.27] + - [549, 9909.27] - - [4096, 1024, 1, 3259] - - [326, 9860.76] + - [555, 9860.76] - - [4096, 1024, 1, 3544] - - [326, 9878.76] + - [555, 9878.76] - - [4096, 1024, 1, 3479] - - [327, 9873.97] + - [556, 9873.97] - - [4096, 1024, 1, 3542] - - [326, 9878.97] + - [555, 9878.97] - - [4096, 1024, 1, 3321] - - [319, 9861.13] + - [548, 9861.13] - - [1024, 4096, 1, 3147] - - [319, 9894.77] + - [548, 9894.77] - - [1024, 4096, 1, 3944] - - [319, 9950.51] + - [548, 9950.51] - - [4096, 1024, 1, 3870] - - [327, 9881.74] + - [556, 9881.74] - - [1024, 4096, 1, 3308] - - [319, 9907.26] + - [548, 9907.26] - - [4096, 1024, 1, 3401] - - [326, 9864.59] + - [555, 9864.59] - - [1024, 4096, 1, 3395] - - [319, 9929.03] + - [548, 9929.03] - - [64, 99, 624, 102] - - [295, 5651.36] + - [524, 5651.36] - - [1024, 4096, 1, 3563] - - [326, 9922.76] + - [555, 9922.76] - - [1024, 33708, 1, 3870] - - [319, 10325.4] + - [548, 10325.4] - - [4096, 1024, 1, 3494] - - [326, 9875.37] + - [555, 9875.37] - - [1024, 4096, 1, 3271] - - [319, 9913.09] + - [548, 9913.09] - - [1024, 33708, 1, 3910] - - [319, 10341.5] + - [548, 10341.5] - - [1024, 4096, 1, 3287] - - [327, 9924.87] + - [556, 9924.87] - - [1024, 33708, 1, 3860] - - [319, 10330.7] + - [548, 10330.7] - - [64, 143, 432, 148] - - [312, 6571.78] + - [541, 6571.78] - - [1024, 1024, 1, 3584] - - [324, 8975.31] + - [553, 8975.31] - - [64, 162, 400, 162] - - [314, 6822.26] + - [543, 6822.26] - - [4096, 1024, 1, 3341] - - [326, 9854.66] + - [555, 9854.66] - - [1024, 4096, 1, 3136] - - [319, 9926.86] + - [548, 9926.86] - - [4096, 1024, 1, 3439] - - [326, 9854.33] + - [555, 9854.33] - - [64, 148, 432, 147] - - [310, 6677.61] + - [539, 6677.61] - - [1024, 4096, 1, 3751] - - [326, 9938.48] + - [555, 9938.48] - - [1024, 4096, 1, 3301] - - [326, 9919.15] + - [555, 9919.15] - - [4096, 1024, 1, 3468] - - [327, 9859.83] + - [556, 9859.83] - - [1024, 4096, 1, 3416] - - [327, 9918.52] + - [556, 9918.52] - - [4096, 1024, 1, 3163] - - [326, 9854.65] + - [555, 9854.65] - - [1024, 4096, 1, 3230] - - [320, 9897.54] + - [549, 9897.54] - - [1024, 4096, 1, 3581] - - [320, 9915.48] + - [549, 9915.48] - - [1024, 1024, 1, 3960] - - [322, 9045.86] + - [551, 9045.86] - - [4096, 1024, 1, 3463] - - [327, 9884.74] + - [556, 9884.74] - - [1024, 4096, 1, 3478] - - [320, 9927.02] + - [549, 9927.02] - - [4096, 1024, 1, 3262] - - [326, 9852.22] + - [555, 9852.22] - - [1024, 4096, 1, 3438] - - [326, 9912.68] + - [555, 9912.68] - - [1024, 4096, 1, 3244] - - [319, 9900.51] + - [548, 9900.51] - - [1024, 4096, 1, 3445] - - [319, 9920.32] + - [548, 9920.32] - - [4096, 1024, 1, 3328] - - [326, 9888.07] + - [555, 9888.07] - - [1024, 4096, 1, 3492] - - [320, 9937.22] + - [549, 9937.22] - - [4096, 1024, 1, 3211] - - [320, 9847.95] + - [549, 9847.95] - - [1024, 4096, 1, 3910] - - [327, 9946.57] + - [556, 9946.57] - - [1024, 4096, 1, 3314] - - [319, 9932.6] + - [548, 9932.6] - - [4096, 1024, 1, 3859] - - [326, 9902.84] + - [555, 9902.84] - - [4096, 1024, 1, 3383] - - [326, 9875.2] + - [555, 9875.2] - - [1024, 4096, 1, 3409] - - [327, 9926.79] + - [556, 9926.79] - - [1024, 4096, 1, 4020] - - [319, 9941.8] + - [548, 9941.8] - - [4096, 1024, 1, 3530] - - [326, 9872.81] + - [555, 9872.81] - - [4096, 1024, 1, 3411] - - [327, 9875.02] + - [556, 9875.02] - - [1024, 4096, 1, 3566] - - [327, 9921.1] + - [556, 9921.1] - - [4096, 1024, 1, 3493] - - [319, 9875.74] + - [548, 9875.74] - - [4096, 1024, 1, 3184] - - [326, 9873.14] + - [555, 9873.14] - - [1024, 4096, 1, 3072] - - [319, 9923.79] + - [548, 9923.79] - - [1024, 4096, 1, 3431] - - [320, 9911.03] + - [549, 9911.03] - - [4096, 1024, 1, 3306] - - [327, 9853.42] + - [556, 9853.42] - - [1024, 4096, 1, 3352] - - [327, 9913.32] + - [556, 9913.32] - - [4096, 1024, 1, 3295] - - [326, 9862.68] + - [555, 9862.68] - - [64, 123, 528, 122] - - [292, 6950.25] + - [521, 6950.25] - - [1024, 4096, 1, 3517] - - [320, 9920.06] + - [549, 9920.06] - - [64, 102, 624, 101] - - [300, 5791.49] + - [529, 5791.49] - - [4096, 1024, 1, 3426] - - [326, 9891.14] + - [555, 9891.14] - - [4096, 1024, 1, 3385] - - [326, 9868.41] + - [555, 9868.41] - - [1024, 1024, 1, 3978] - - [317, 9008.48] + - [546, 9008.48] - - [4096, 1024, 1, 3572] - - [319, 9884.81] + - [548, 9884.81] - - [4096, 1024, 1, 3459] - - [326, 9892.17] + - [555, 9892.17] - - [1024, 4096, 1, 3374] - - [327, 9908.52] + - [556, 9908.52] - - [4096, 1024, 1, 3166] - - [326, 9832.45] + - [555, 9832.45] - - [4096, 1024, 1, 3093] - - [327, 9841.25] + - [556, 9841.25] - - [4096, 1024, 1, 3523] - - [320, 9879.05] + - [549, 9879.05] - - [4096, 1024, 1, 3413] - - [320, 9880.81] + - [549, 9880.81] - - [1024, 4096, 1, 3996] - - [319, 9948.14] + - [548, 9948.14] - - [1024, 4096, 1, 3452] - - [327, 9915.97] + - [556, 9915.97] - - [4096, 1024, 1, 3232] - - [327, 9876.54] + - [556, 9876.54] - - [4096, 1024, 1, 3400] - - [319, 9867.15] + - [548, 9867.15] - - [4096, 1024, 1, 3334] - - [326, 9868.99] + - [555, 9868.99] - - [1024, 4096, 1, 3345] - - [319, 9920.6] + - [548, 9920.6] - - [1024, 4096, 1, 3538] - - [326, 9933.34] + - [555, 9933.34] - - [1024, 4096, 1, 3466] - - [326, 9920.85] + - [555, 9920.85] - - [4096, 1024, 1, 3315] - - [326, 9876.87] + - [555, 9876.87] - - [4096, 1024, 1, 3214] - - [327, 9847.93] + - [556, 9847.93] - - [1024, 33708, 1, 3900] - - [319, 10331.7] + - [548, 10331.7] - - [64, 160, 400, 160] - - [312, 7440.61] + - [541, 7440.61] - - [1024, 4096, 1, 3367] - - [326, 9926.32] + - [555, 9926.32] - - [1024, 4096, 1, 2917] - - [327, 9904.57] + - [556, 9904.57] - - [1024, 1024, 1, 3995] - - [318, 9000.33] + - [547, 9000.33] - - [64, 132, 480, 134] - - [310, 6146.88] + - [539, 6146.88] - - [1024, 4096, 1, 3544] - - [327, 9924.14] + - [556, 9924.14] - - [4096, 1024, 1, 3414] - - [327, 9867.9] + - [556, 9867.9] - - [4096, 1024, 1, 3565] - - [320, 9870.13] + - [549, 9870.13] - - [1024, 4096, 1, 3512] - - [326, 9919.84] + - [555, 9919.84] - - [1024, 4096, 1, 3191] - - [327, 9914.79] + - [556, 9914.79] - - [64, 27, 2336, 27] - - [294, 3054.71] + - [523, 3054.71] - - [1024, 4096, 1, 3289] - - [327, 9917.2] + - [556, 9917.2] - - [4096, 1024, 1, 3290] - - [326, 9858.41] + - [555, 9858.41] - - [1024, 4096, 1, 3211] - - [327, 9897.16] + - [556, 9897.16] - - [1024, 33708, 1, 3969] - - [320, 10336.1] + - [549, 10336.1] - - [4096, 1024, 1, 3566] - - [326, 9863.0] + - [555, 9863.0] - - [64, 111, 576, 111] - - [300, 6400.91] + - [529, 6400.91] - - [1024, 4096, 1, 3459] - - [326, 9923.03] + - [555, 9923.03] - - [1024, 4096, 1, 3372] - - [319, 9909.86] + - [548, 9909.86] - - [4096, 1024, 1, 3339] - - [326, 9859.3] + - [555, 9859.3] - - [4096, 1024, 1, 3425] - - [326, 9889.34] + - [555, 9889.34] - - [4096, 1024, 1, 3388] - - [326, 9871.67] + - [555, 9871.67] - - [1024, 4096, 1, 3531] - - [319, 9919.0] + - [548, 9919.0] - - [4096, 1024, 1, 3286] - - [327, 9868.42] + - [556, 9868.42] - - [4096, 1024, 1, 3462] - - [326, 9881.88] + - [555, 9881.88] - - [1024, 4096, 1, 3388] - - [319, 9904.69] + - [548, 9904.69] - - [4096, 1024, 1, 3165] - - [319, 9836.33] + - [548, 9836.33] - - [4096, 1024, 1, 3304] - - [326, 9857.55] + - [555, 9857.55] - - [1024, 4096, 1, 2736] - - [326, 9901.07] + - [555, 9901.07] - - [4096, 1024, 1, 3397] - - [326, 9872.1] + - [555, 9872.1] - - [64, 38, 1680, 38] - - [293, 3459.52] + - [522, 3459.52] - - [1024, 4096, 1, 3311] - - [327, 9908.32] + - [556, 9908.32] - - [1024, 4096, 1, 3394] - - [327, 9929.43] + - [556, 9929.43] - - [4096, 1024, 1, 2736] - - [326, 9833.88] + - [555, 9833.88] - - [1024, 4096, 1, 3559] - - [320, 9925.33] + - [549, 9925.33] - - [4096, 1024, 1, 3180] - - [326, 9838.05] + - [555, 9838.05] - - [1024, 4096, 1, 3480] - - [319, 9922.46] + - [548, 9922.46] - - [4096, 1024, 1, 3318] - - [326, 9867.87] + - [555, 9867.87] - - [4096, 1024, 1, 3213] - - [326, 9846.02] + - [555, 9846.02] - - [1024, 4096, 1, 3286] - - [326, 9912.14] + - [555, 9912.14] - - [4096, 1024, 1, 3471] - - [326, 9874.24] + - [555, 9874.24] - - [1024, 4096, 1, 3381] - - [327, 9922.96] + - [556, 9922.96] - - [64, 100, 624, 100] - - [301, 5705.24] + - [530, 5705.24] - - [4096, 1024, 1, 3502] - - [326, 9872.44] + - [555, 9872.44] - - [64, 16, 3840, 16] - - [307, 2091.67] + - [536, 2091.67] - - [1024, 4096, 1, 3552] - - [319, 9943.89] + - [548, 9943.89] - - [4096, 1024, 1, 3519] - - [327, 9869.95] + - [556, 9869.95] - - [1024, 4096, 1, 3300] - - [320, 9916.15] + - [549, 9916.15] - - [1024, 4096, 1, 3419] - - [319, 9914.06] + - [548, 9914.06] - - [4096, 1024, 1, 4030] - - [320, 9893.73] + - [549, 9893.73] - - [4096, 1024, 1, 3976] - - [327, 9898.35] + - [556, 9898.35] - - [1024, 4096, 1, 3473] - - [327, 9928.42] + - [556, 9928.42] - - [1024, 1024, 1, 3977] - - [324, 9009.33] + - [553, 9009.33] - - [4096, 1024, 1, 3428] - - [326, 9876.79] + - [555, 9876.79] - - [1024, 4096, 1, 3433] - - [320, 9923.92] + - [549, 9923.92] - - [4096, 1024, 1, 3534] - - [320, 9864.0] + - [549, 9864.0] - - [4096, 1024, 1, 3461] - - [326, 9873.12] + - [555, 9873.12] - - [4096, 1024, 1, 3681] - - [326, 9898.57] + - [555, 9898.57] - - [4096, 1024, 1, 3495] - - [327, 9876.08] + - [556, 9876.08] - - [4096, 1024, 1, 3351] - - [326, 9879.71] + - [555, 9879.71] - - [1024, 4096, 1, 4059] - - [319, 9948.61] + - [548, 9948.61] - - [4096, 1024, 1, 3990] - - [326, 9900.76] + - [555, 9900.76] - - [1024, 4096, 1, 3325] - - [320, 9903.3] + - [549, 9903.3] - - [1024, 4096, 1, 3408] - - [326, 9932.15] + - [555, 9932.15] - - [64, 59, 1088, 59] - - [300, 5343.77] + - [529, 5343.77] - - [4096, 1024, 1, 3394] - - [327, 9878.17] + - [556, 9878.17] - - [1024, 4096, 1, 3573] - - [327, 9935.3] + - [556, 9935.3] - - [4096, 1024, 1, 3386] - - [326, 9866.38] + - [555, 9866.38] - - [4096, 1024, 1, 3540] - - [326, 9882.33] + - [555, 9882.33] - - [1024, 4096, 1, 3182] - - [320, 9894.45] + - [549, 9894.45] - - [1024, 4096, 1, 3430] - - [319, 9915.24] + - [548, 9915.24] - - [1024, 4096, 1, 3236] - - [327, 9920.56] + - [556, 9920.56] - - [4096, 1024, 1, 2977] - - [326, 9848.08] + - [555, 9848.08] - - [1024, 4096, 1, 3355] - - [326, 9908.78] + - [555, 9908.78] - - [4096, 1024, 1, 3139] - - [326, 9850.71] + - [555, 9850.71] - - [4096, 1024, 1, 3516] - - [320, 9874.21] + - [549, 9874.21] - - [4096, 1024, 1, 3368] - - [320, 9872.64] + - [549, 9872.64] - - [4096, 1024, 1, 3559] - - [319, 9884.32] + - [548, 9884.32] - - [64, 11, 5456, 11] - - [307, 1382.67] + - [536, 1382.67] - - [1024, 4096, 1, 3506] - - [326, 9937.69] + - [555, 9937.69] - - [1024, 4096, 1, 3145] - - [319, 9905.11] + - [548, 9905.11] - - [1024, 4096, 1, 3369] - - [326, 9912.71] + - [555, 9912.71] - - [64, 112, 576, 112] - - [292, 6583.56] + - [521, 6583.56] - - [4096, 1024, 1, 3522] - - [326, 9889.47] + - [555, 9889.47] - - [1024, 33708, 1, 3894] - - [319, 10337.5] + - [548, 10337.5] - - [64, 159, 400, 162] - - [310, 7057.09] + - [539, 7057.09] - - [4096, 1024, 1, 3336] - - [326, 9867.67] + - [555, 9867.67] - - [1024, 4096, 1, 3382] - - [320, 9915.9] + - [549, 9915.9] - - [4096, 1024, 1, 3533] - - [326, 9878.56] + - [555, 9878.56] - - [4096, 1024, 1, 4050] - - [327, 9916.82] + - [556, 9916.82] - - [4096, 1024, 1, 3480] - - [320, 9869.32] + - [549, 9869.32] - - [1024, 4096, 1, 3344] - - [319, 9935.61] + - [548, 9935.61] - - [64, 122, 528, 122] - - [292, 6871.14] + - [521, 6871.14] - - [1024, 4096, 1, 3509] - - [320, 9925.8] + - [549, 9925.8] - - [1024, 4096, 1, 3956] - - [319, 9958.26] + - [548, 9958.26] - - [4096, 1024, 1, 3616] - - [326, 9904.63] + - [555, 9904.63] - - [1024, 4096, 1, 3366] - - [319, 9919.47] + - [548, 9919.47] - - [4096, 1024, 1, 2935] - - [319, 9833.23] + - [548, 9833.23] - - [4096, 1024, 1, 3393] - - [326, 9877.45] + - [555, 9877.45] - - [4096, 1024, 1, 3547] - - [320, 9865.1] + - [549, 9865.1] - - [1024, 4096, 1, 3499] - - [327, 9912.49] + - [556, 9912.49] - - [4096, 1024, 1, 3357] - - [326, 9855.28] + - [555, 9855.28] - - [4096, 1024, 1, 3272] - - [326, 9861.97] + - [555, 9861.97] - - [4096, 1024, 1, 3207] - - [326, 9847.78] + - [555, 9847.78] - - [4096, 1024, 1, 3894] - - [326, 9918.86] + - [555, 9918.86] - - [1024, 4096, 1, 3444] - - [326, 9932.71] + - [555, 9932.71] - - [4096, 1024, 1, 3561] - - [326, 9872.61] + - [555, 9872.61] - - [4096, 1024, 1, 3376] - - [326, 9885.59] + - [555, 9885.59] - - [1024, 4096, 1, 3458] - - [326, 9929.39] + - [555, 9929.39] - - [4096, 1024, 1, 3231] - - [320, 9847.08] + - [549, 9847.08] - - [64, 228, 272, 228] - - [321, 7302.69] + - [550, 7302.69] - - [1024, 4096, 1, 3505] - - [327, 9931.63] + - [556, 9931.63] - - [4096, 1024, 1, 3277] - - [326, 9857.2] + - [555, 9857.2] - - [64, 21, 2976, 21] - - [296, 2436.14] + - [525, 2436.14] - - [1024, 4096, 1, 3391] - - [326, 9911.25] + - [555, 9911.25] - - [64, 32, 1984, 32] - - [308, 3572.17] + - [537, 3572.17] - - [1024, 4096, 1, 3536] - - [327, 9946.9] + - [556, 9946.9] - - [1024, 4096, 1, 3063] - - [326, 9906.92] + - [555, 9906.92] - - [1024, 1024, 1, 3925] - - [318, 9011.45] + - [547, 9011.45] - - [1024, 4096, 1, 3189] - - [320, 9900.95] + - [549, 9900.95] - - [1024, 4096, 1, 2505] - - [326, 9854.85] + - [555, 9854.85] - - [4096, 1024, 1, 3454] - - [319, 9864.96] + - [548, 9864.96] - - [1024, 4096, 1, 3405] - - [327, 9906.33] + - [556, 9906.33] - - [1024, 33708, 1, 4050] - - [320, 10343.7] + - [549, 10343.7] - - [4096, 1024, 1, 3520] - - [326, 9887.03] + - [555, 9887.03] - - [64, 93, 688, 93] - - [303, 6222.86] + - [532, 6222.86] - - [1024, 4096, 1, 3487] - - [327, 9918.69] + - [556, 9918.69] - - [1024, 4096, 1, 3558] - - [327, 9930.99] + - [556, 9930.99] - - [4096, 1024, 1, 3297] - - [326, 9874.31] + - [555, 9874.31] - - [1024, 1024, 1, 3840] - - [322, 9075.42] + - [551, 9075.42] - - [1024, 4096, 1, 3483] - - [326, 9915.38] + - [555, 9915.38] - - [1024, 1024, 1, 3956] - - [325, 9010.03] + - [554, 9010.03] - - [1024, 33708, 1, 3751] - - [320, 10325.9] + - [549, 10325.9] - - [4096, 1024, 1, 3380] - - [326, 9888.47] + - [555, 9888.47] - - [1024, 4096, 1, 3380] - - [319, 9927.25] + - [548, 9927.25] - - [1024, 4096, 1, 3396] - - [327, 9931.96] + - [556, 9931.96] - - [1024, 4096, 1, 3497] - - [320, 9914.86] + - [549, 9914.86] - - [1024, 4096, 1, 3502] - - [327, 9921.52] + - [556, 9921.52] - - [1024, 1024, 1, 3976] - - [322, 9060.3] + - [551, 9060.3] - - [1024, 4096, 1, 3138] - - [320, 9908.66] + - [549, 9908.66] - - [4096, 1024, 1, 3939] - - [319, 9910.23] + - [548, 9910.23] - - [1024, 4096, 1, 3303] - - [320, 9916.64] + - [549, 9916.64] - - [64, 111, 576, 112] - - [300, 6495.19] + - [529, 6495.19] - - [1024, 4096, 1, 3418] - - [326, 9913.35] + - [555, 9913.35] - - [1024, 4096, 1, 3224] - - [320, 9904.05] + - [549, 9904.05] - - [4096, 1024, 1, 3978] - - [326, 9896.28] + - [555, 9896.28] - - [1024, 4096, 1, 3472] - - [319, 9937.48] + - [548, 9937.48] - - [4096, 1024, 1, 3353] - - [327, 9863.97] + - [556, 9863.97] - - [4096, 1024, 1, 3362] - - [326, 9871.06] + - [555, 9871.06] - - [1024, 33708, 1, 3978] - - [319, 10325.4] + - [548, 10325.4] - - [64, 100, 624, 102] - - [295, 5695.67] + - [524, 5695.67] - - [1024, 4096, 1, 3432] - - [327, 9915.56] + - [556, 9915.56] - - [1024, 4096, 1, 3139] - - [326, 9914.21] + - [555, 9914.21] - - [1024, 4096, 1, 3341] - - [327, 9912.1] + - [556, 9912.1] - - [1024, 4096, 1, 3494] - - [320, 9924.6] + - [549, 9924.6] - - [1024, 4096, 1, 3969] - - [319, 9952.28] + - [548, 9952.28] - - [1024, 4096, 1, 3163] - - [327, 9911.79] + - [556, 9911.79] - - [1024, 1024, 1, 3955] - - [317, 9097.86] + - [546, 9097.86] - - [4096, 1024, 1, 3405] - - [326, 9853.84] + - [555, 9853.84] - - [1024, 1024, 1, 4030] - - [317, 9083.86] + - [546, 9083.86] - - [4096, 1024, 1, 3453] - - [326, 9858.88] + - [555, 9858.88] - - [1024, 4096, 1, 3411] - - [327, 9926.54] + - [556, 9926.54] - - [1024, 4096, 1, 3527] - - [320, 9922.65] + - [549, 9922.65] - - [4096, 1024, 1, 3474] - - [326, 9878.49] + - [555, 9878.49] - - [1024, 4096, 1, 3572] - - [326, 9932.0] + - [555, 9932.0] - - [4096, 1024, 1, 3293] - - [326, 9848.26] + - [555, 9848.26] - - [4096, 1024, 1, 3247] - - [326, 9861.45] + - [555, 9861.45] - - [64, 15, 4096, 15] - - [307, 1955.75] + - [536, 1955.75] - - [1024, 4096, 1, 3425] - - [327, 9936.4] + - [556, 9936.4] - - [1024, 4096, 1, 3354] - - [319, 9917.55] + - [548, 9917.55] - - [4096, 1024, 1, 3382] - - [326, 9885.49] + - [555, 9885.49] - - [4096, 1024, 1, 3236] - - [326, 9860.6] + - [555, 9860.6] - - [1024, 4096, 1, 3519] - - [327, 9919.3] + - [556, 9919.3] - - [4096, 1024, 1, 3354] - - [326, 9854.75] + - [555, 9854.75] - - [4096, 1024, 1, 3501] - - [327, 9869.62] + - [556, 9869.62] - - [1024, 1024, 1, 3906] - - [325, 9104.99] + - [554, 9104.99] - - [4096, 1024, 1, 3266] - - [326, 9873.97] + - [555, 9873.97] - - [64, 101, 624, 102] - - [295, 5765.52] + - [524, 5765.52] - - [1024, 4096, 1, 3368] - - [326, 9909.77] + - [555, 9909.77] - - [1024, 4096, 1, 4030] - - [327, 9940.27] + - [556, 9940.27] - - [1024, 4096, 1, 3533] - - [320, 9916.64] + - [549, 9916.64] - - [4096, 1024, 1, 3332] - - [327, 9876.45] + - [556, 9876.45] - - [4096, 1024, 1, 3584] - - [326, 9896.6] + - [555, 9896.6] - - [1024, 4096, 1, 3616] - - [326, 9957.18] + - [555, 9957.18] - - [4096, 1024, 1, 3265] - - [326, 9877.78] + - [555, 9877.78] - - [4096, 1024, 1, 3361] - - [326, 9888.61] + - [555, 9888.61] - - [4096, 1024, 1, 3467] - - [326, 9863.4] + - [555, 9863.4] - - [1024, 4096, 1, 3454] - - [320, 9904.89] + - [549, 9904.89] - - [1024, 4096, 1, 3101] - - [327, 9893.12] + - [556, 9893.12] - - [1024, 4096, 1, 3508] - - [327, 9931.54] + - [556, 9931.54] - - [4096, 1024, 1, 3267] - - [326, 9864.48] + - [555, 9864.48] - - [64, 54, 1184, 54] - - [292, 4906.02] + - [521, 4906.02] - - [4096, 1024, 1, 3419] - - [326, 9872.56] + - [555, 9872.56] - - [4096, 1024, 1, 3822] - - [326, 9892.63] + - [555, 9892.63] - - [1024, 4096, 1, 3266] - - [326, 9918.58] + - [555, 9918.58] - - [4096, 1024, 1, 3440] - - [327, 9890.16] + - [556, 9890.16] - - [1024, 4096, 1, 3361] - - [326, 9930.97] + - [555, 9930.97] - - [1024, 4096, 1, 3546] - - [320, 9926.56] + - [549, 9926.56] - - [4096, 1024, 1, 3473] - - [326, 9889.06] + - [555, 9889.06] - - [4096, 1024, 1, 3546] - - [327, 9872.27] + - [556, 9872.27] - - [1024, 4096, 1, 3088] - - [320, 9918.03] + - [549, 9918.03] - - [1024, 4096, 1, 3535] - - [327, 9921.2] + - [556, 9921.2] - - [1024, 4096, 1, 3447] - - [327, 9920.63] + - [556, 9920.63] - - [1024, 4096, 1, 3560] - - [326, 9925.48] + - [555, 9925.48] - - [1024, 4096, 1, 3422] - - [320, 9922.21] + - [549, 9922.21] - - [1024, 4096, 1, 3469] - - [319, 9906.18] + - [548, 9906.18] - - [4096, 1024, 1, 3488] - - [326, 9903.26] + - [555, 9903.26] - - [1024, 4096, 1, 3110] - - [326, 9906.76] + - [555, 9906.76] - - [1024, 4096, 1, 3265] - - [327, 9916.69] + - [556, 9916.69] - - [1024, 4096, 1, 3291] - - [326, 9902.73] + - [555, 9902.73] - - [1024, 4096, 1, 3390] - - [327, 9907.22] + - [556, 9907.22] - - [4096, 1024, 1, 3046] - - [326, 9847.68] + - [555, 9847.68] - - [1024, 4096, 1, 3539] - - [327, 9933.49] + - [556, 9933.49] - - [4096, 1024, 1, 3221] - - [327, 9860.74] + - [556, 9860.74] - - [4096, 1024, 1, 3433] - - [326, 9872.74] + - [555, 9872.74] - - [4096, 1024, 1, 3364] - - [327, 9881.91] + - [556, 9881.91] - - [4096, 1024, 1, 3470] - - [326, 9858.56] + - [555, 9858.56] - - [1024, 4096, 1, 3404] - - [319, 9907.27] + - [548, 9907.27] - - [1024, 33708, 1, 3968] - - [320, 10350.3] + - [549, 10350.3] - - [4096, 1024, 1, 3088] - - [326, 9869.06] + - [555, 9869.06] - - [1024, 4096, 1, 3247] - - [326, 9901.02] + - [555, 9901.02] - - [1024, 33708, 1, 3996] - - [319, 10328.5] + - [548, 10328.5] - - [4096, 1024, 1, 3482] - - [327, 9866.99] + - [556, 9866.99] - - [1024, 1024, 1, 3796] - - [322, 9031.68] + - [551, 9031.68] - - [4096, 1024, 1, 3995] - - [327, 9896.78] + - [556, 9896.78] - - [1024, 1024, 1, 3859] - - [324, 9097.36] + - [553, 9097.36] - - [1024, 4096, 1, 3280] - - [320, 9934.05] + - [549, 9934.05] - - [4096, 1024, 1, 3271] - - [327, 9860.09] + - [556, 9860.09] - - [64, 10, 5952, 10] - - [307, 1221.02] + - [536, 1221.02] - - [4096, 1024, 1, 3545] - - [326, 9877.35] + - [555, 9877.35] - - [4096, 1024, 1, 3476] - - [319, 9882.57] + - [548, 9882.57] - - [4096, 1024, 1, 3496] - - [320, 9880.5] + - [549, 9880.5] - - [4096, 1024, 1, 3191] - - [320, 9858.7] + - [549, 9858.7] - - [4096, 1024, 1, 3311] - - [327, 9853.2] + - [556, 9853.2] - - [1024, 4096, 1, 3302] - - [327, 9919.32] + - [556, 9919.32] - - [1024, 4096, 1, 3681] - - [326, 9944.99] + - [555, 9944.99] - - [4096, 1024, 1, 3582] - - [319, 9869.77] + - [548, 9869.77] - - [4096, 1024, 1, 3421] - - [327, 9856.08] + - [556, 9856.08] - - [4096, 1024, 1, 3560] - - [320, 9884.48] + - [549, 9884.48] - - [1024, 4096, 1, 3495] - - [327, 9930.13] + - [556, 9930.13] - - [4096, 1024, 1, 3186] - - [326, 9870.59] + - [555, 9870.59] - - [4096, 1024, 1, 3925] - - [326, 9904.0] + - [555, 9904.0] - - [64, 71, 896, 71] - - [311, 5004.79] + - [540, 5004.79] - - [1024, 4096, 1, 3435] - - [327, 9916.58] + - [556, 9916.58] - - [4096, 1024, 1, 3434] - - [326, 9871.29] + - [555, 9871.29] - - [1024, 33708, 1, 4012] - - [319, 10332.5] + - [548, 10332.5] - - [1024, 4096, 1, 3340] - - [319, 9918.11] + - [548, 9918.11] - - [1024, 1024, 1, 3860] - - [317, 8999.36] + - [546, 8999.36] - - [4096, 1024, 1, 3489] - - [326, 9882.02] + - [555, 9882.02] - - [1024, 4096, 1, 3162] - - [327, 9906.28] + - [556, 9906.28] - - [4096, 1024, 1, 3436] - - [326, 9858.12] + - [555, 9858.12] - - [1024, 1024, 1, 4005] - - [323, 9043.06] + - [552, 9043.06] - - [64, 84, 752, 84] - - [296, 5629.93] + - [525, 5629.93] - - [4096, 1024, 1, 3574] - - [326, 9886.7] + - [555, 9886.7] - - [4096, 1024, 1, 3469] - - [319, 9856.26] + - [548, 9856.26] - - [1024, 4096, 1, 3410] - - [320, 9924.74] + - [549, 9924.74] - - [1024, 4096, 1, 3216] - - [319, 9930.67] + - [548, 9930.67] - - [4096, 1024, 1, 3095] - - [326, 9847.01] + - [555, 9847.01] - - [1024, 1024, 1, 3990] - - [325, 9089.04] + - [554, 9089.04] - - [4096, 1024, 1, 3448] - - [326, 9863.94] + - [555, 9863.94] - - [1024, 4096, 1, 3176] - - [327, 9914.01] + - [556, 9914.01] - - [64, 49, 1296, 49] - - [292, 4437.46] + - [521, 4437.46] - - [4096, 1024, 1, 2918] - - [326, 9830.93] + - [555, 9830.93] - - [64, 14, 4368, 14] - - [306, 1802.47] + - [535, 1802.47] - - [1024, 4096, 1, 3424] - - [326, 9934.05] + - [555, 9934.05] - - [4096, 1024, 1, 3402] - - [319, 9863.12] + - [548, 9863.12] - - [4096, 1024, 1, 3145] - - [320, 9856.56] + - [549, 9856.56] - - [64, 134, 480, 134] - - [312, 6184.05] + - [541, 6184.05] - - [1024, 33708, 1, 3976] - - [320, 10330.1] + - [549, 10330.1] - - [4096, 1024, 1, 3518] - - [319, 9856.07] + - [548, 9856.07] - - [4096, 1024, 1, 3110] - - [326, 9856.46] + - [555, 9856.46] - - [4096, 1024, 1, 3325] - - [326, 9852.36] + - [555, 9852.36] - - [1024, 33708, 1, 3999] - - [319, 10329.7] + - [548, 10329.7] - - [4096, 1024, 1, 2985] - - [326, 9837.3] + - [555, 9837.3] - - [1024, 4096, 1, 3371] - - [319, 9913.03] + - [548, 9913.03] - - [4096, 1024, 1, 3342] - - [326, 9863.16] + - [555, 9863.16] - - [4096, 1024, 1, 3141] - - [320, 9849.91] + - [549, 9849.91] - - [4096, 1024, 1, 3532] - - [320, 9866.3] + - [549, 9866.3] - - [64, 78, 816, 78] - - [297, 5316.88] + - [526, 5316.88] - - [1024, 4096, 1, 3169] - - [327, 9910.45] + - [556, 9910.45] - - [1024, 4096, 1, 3514] - - [326, 9918.0] + - [555, 9918.0] - - [4096, 1024, 1, 3780] - - [327, 9899.75] + - [556, 9899.75] - - [1024, 4096, 1, 3098] - - [319, 9901.62] + - [548, 9901.62] - - [1024, 4096, 1, 3449] - - [327, 9919.85] + - [556, 9919.85] - - [1024, 4096, 1, 3222] - - [319, 9917.66] + - [548, 9917.66] - - [1024, 4096, 1, 3346] - - [320, 9912.91] + - [549, 9912.91] - - [4096, 1024, 1, 3064] - - [327, 9848.79] + - [556, 9848.79] - - [4096, 1024, 1, 3511] - - [326, 9873.39] + - [555, 9873.39] - - [4096, 1024, 1, 3384] - - [326, 9870.98] + - [555, 9870.98] - - [4096, 1024, 1, 3356] - - [320, 9853.45] + - [549, 9853.45] - - [1024, 4096, 1, 3796] - - [319, 9940.66] + - [548, 9940.66] - - [4096, 1024, 1, 3427] - - [326, 9883.14] + - [555, 9883.14] - - [4096, 1024, 1, 3390] - - [326, 9863.79] + - [555, 9863.79] - - [4096, 1024, 1, 3573] - - [327, 9886.02] + - [556, 9886.02] - - [4096, 1024, 1, 3456] - - [320, 9890.61] + - [549, 9890.61] - - [1024, 4096, 1, 3360] - - [327, 9938.1] + - [556, 9938.1] - - [1024, 33708, 1, 3977] - - [320, 10327.2] + - [549, 10327.2] - - [1024, 4096, 1, 2918] - - [319, 9902.84] + - [548, 9902.84] - - [4096, 1024, 1, 3975] - - [326, 9905.27] + - [555, 9905.27] - - [4096, 1024, 1, 3525] - - [327, 9879.91] + - [556, 9879.91] - - [4096, 1024, 1, 3398] - - [319, 9873.91] + - [548, 9873.91] - - [4096, 1024, 1, 3640] - - [326, 9885.16] + - [555, 9885.16] - - [1024, 1024, 1, 3999] - - [318, 8995.42] + - [547, 8995.42] - - [4096, 1024, 1, 3014] - - [326, 9841.32] + - [555, 9841.32] - - [1024, 4096, 1, 3446] - - [319, 9917.21] + - [548, 9917.21] - - [1024, 33708, 1, 3796] - - [319, 10339.0] + - [548, 10339.0] - - [4096, 1024, 1, 3101] - - [319, 9827.34] + - [548, 9827.34] - - [4096, 1024, 1, 3563] - - [327, 9863.03] + - [556, 9863.03] - - [4096, 1024, 1, 3539] - - [319, 9889.54] + - [548, 9889.54] - - [4096, 1024, 1, 3182] - - [326, 9833.79] + - [555, 9833.79] - - [1024, 4096, 1, 3468] - - [320, 9913.05] + - [549, 9913.05] - - [4096, 1024, 1, 3312] - - [326, 9889.85] + - [555, 9889.85] - - [4096, 1024, 1, 3215] - - [326, 9853.88] + - [555, 9853.88] - - [4096, 1024, 1, 3910] - - [326, 9894.72] + - [555, 9894.72] - - [1024, 33708, 1, 3780] - - [320, 10332.0] + - [549, 10332.0] - - [1024, 4096, 1, 3290] - - [326, 9915.08] + - [555, 9915.08] - - [1024, 4096, 1, 4012] - - [326, 9942.65] + - [555, 9942.65] - - [1024, 4096, 1, 3385] - - [326, 9915.83] + - [555, 9915.83] - - [1024, 33708, 1, 3975] - - [319, 10330.1] + - [548, 10330.1] - - [4096, 1024, 1, 3996] - - [326, 9891.31] + - [555, 9891.31] - - [4096, 1024, 1, 2765] - - [327, 9800.38] + - [556, 9800.38] - - [4096, 1024, 1, 3538] - - [327, 9886.22] + - [556, 9886.22] - - [4096, 1024, 1, 3415] - - [327, 9874.6] + - [556, 9874.6] - - [1024, 4096, 1, 3554] - - [326, 9931.99] + - [555, 9931.99] - - [4096, 1024, 1, 3513] - - [320, 9874.25] + - [549, 9874.25] - - [1024, 4096, 1, 3304] - - [320, 9907.73] + - [549, 9907.73] - - [4096, 1024, 1, 3294] - - [326, 9851.25] + - [555, 9851.25] - - [4096, 1024, 1, 3396] - - [327, 9880.7] + - [556, 9880.7] - - [1024, 4096, 1, 3213] - - [320, 9891.12] + - [549, 9891.12] - - [4096, 1024, 1, 3137] - - [320, 9857.41] + - [549, 9857.41] - - [4096, 1024, 1, 3552] - - [326, 9904.22] + - [555, 9904.22] - - [1024, 1024, 1, 4020] - - [325, 9098.87] + - [554, 9098.87] - - [64, 13, 4672, 13] - - [307, 1693.54] + - [536, 1693.54] - - [1024, 4096, 1, 3461] - - [326, 9918.45] + - [555, 9918.45] - - [4096, 1024, 1, 3263] - - [319, 9843.89] + - [548, 9843.89] - - [4096, 1024, 1, 3430] - - [326, 9885.26] + - [555, 9885.26] - - [4096, 1024, 1, 3389] - - [326, 9859.23] + - [555, 9859.23] - - [4096, 1024, 1, 3528] - - [326, 9873.01] + - [555, 9873.01] - - [1024, 4096, 1, 3463] - - [327, 9929.61] + - [556, 9929.61] - - [4096, 1024, 1, 3526] - - [327, 9876.9] + - [556, 9876.9] - - [4096, 1024, 1, 3154] - - [326, 9858.25] + - [555, 9858.25] - - [4096, 1024, 1, 3499] - - [327, 9862.92] + - [556, 9862.92] - - [1024, 1024, 1, 3939] - - [325, 9107.41] + - [554, 9107.41] - - [4096, 1024, 1, 3955] - - [327, 9906.28] + - [556, 9906.28] - - [1024, 4096, 1, 3297] - - [320, 9925.34] + - [549, 9925.34] - - [1024, 4096, 1, 3233] - - [326, 9920.65] + - [555, 9920.65] - - [1024, 4096, 1, 3226] - - [326, 9911.35] + - [555, 9911.35] - - [4096, 1024, 1, 3404] - - [326, 9867.28] + - [555, 9867.28] - - [4096, 1024, 1, 3355] - - [326, 9862.66] + - [555, 9862.66] - - [1024, 4096, 1, 3542] - - [326, 9926.49] + - [555, 9926.49] - - [4096, 1024, 1, 3181] - - [327, 9831.86] + - [556, 9831.86] - - [1024, 4096, 1, 3474] - - [326, 9928.03] + - [555, 9928.03] - - [4096, 1024, 1, 3319] - - [326, 9870.28] + - [555, 9870.28] - - [1024, 4096, 1, 3434] - - [319, 9917.51] + - [548, 9917.51] - - [1024, 4096, 1, 3860] - - [326, 9945.32] + - [555, 9945.32] - - [1024, 4096, 1, 3343] - - [319, 9914.66] + - [548, 9914.66] - - [64, 77, 816, 78] - - [297, 5276.97] + - [526, 5276.97] - - [1024, 4096, 1, 3488] - - [326, 9945.81] + - [555, 9945.81] - - [1024, 4096, 1, 3046] - - [326, 9908.78] + - [555, 9908.78] - - [1024, 4096, 1, 3141] - - [327, 9909.18] + - [556, 9909.18] - - [1024, 4096, 1, 3516] - - [327, 9911.38] + - [556, 9911.38] - - [4096, 1024, 1, 3147] - - [326, 9840.47] + - [555, 9840.47] - - [1024, 1024, 1, 4059] - - [318, 9009.78] + - [547, 9009.78] - - [1024, 1024, 1, 3944] - - [318, 9006.17] + - [547, 9006.17] - - [1024, 4096, 1, 3421] - - [327, 9919.86] + - [556, 9919.86] - - [4096, 1024, 1, 3944] - - [320, 9899.53] + - [549, 9899.53] - - [64, 45, 1424, 45] - - [305, 4068.67] + - [534, 4068.67] - - [1024, 4096, 1, 3574] - - [320, 9930.19] + - [549, 9930.19] - - [1024, 4096, 1, 3977] - - [319, 9944.28] + - [548, 9944.28] - - [1024, 1024, 1, 3968] - - [324, 9045.22] + - [553, 9045.22] - - [1024, 4096, 1, 2985] - - [326, 9887.65] + - [555, 9887.65] - - [64, 193, 320, 193] - - [313, 6631.35] + - [542, 6631.35] - - [1024, 4096, 1, 3427] - - [327, 9933.41] + - [556, 9933.41] - - [64, 12, 5040, 12] - - [307, 1552.53] + - [536, 1552.53] - - [1024, 4096, 1, 3482] - - [327, 9942.22] + - [556, 9942.22] - - [1024, 4096, 1, 3332] - - [319, 9923.58] + - [548, 9923.58] - - [1024, 1024, 1, 3720] - - [323, 9039.56] + - [552, 9039.56] - - [4096, 1024, 1, 3308] - - [327, 9852.66] + - [556, 9852.66] - - [1024, 4096, 1, 3513] - - [327, 9919.99] + - [556, 9919.99] - - [1024, 4096, 1, 3154] - - [320, 9908.46] + - [549, 9908.46] - - [1024, 4096, 1, 3955] - - [327, 9950.01] + - [556, 9950.01] - - [1024, 4096, 1, 2967] - - [327, 9897.44] + - [556, 9897.44] - - [1024, 33708, 1, 3942] - - [319, 10336.1] + - [548, 10336.1] - - [1024, 4096, 1, 3319] - - [327, 9912.45] + - [556, 9912.45] - - [4096, 1024, 1, 3860] - - [326, 9909.29] + - [555, 9909.29] - - [1024, 4096, 1, 3548] - - [319, 9924.21] + - [548, 9924.21] - - [4096, 1024, 1, 3977] - - [327, 9891.44] + - [556, 9891.44] - - [4096, 1024, 1, 3535] - - [326, 9867.84] + - [555, 9867.84] - - [1024, 4096, 1, 3541] - - [327, 9923.16] + - [556, 9923.16] - - [1024, 1024, 1, 3910] - - [324, 9080.4] + - [553, 9080.4] - - [1024, 33708, 1, 3584] - - [319, 10333.0] + - [548, 10333.0] - - [1024, 4096, 1, 3168] - - [320, 9926.27] + - [549, 9926.27] - - [1024, 4096, 1, 3448] - - [327, 9922.42] + - [556, 9922.42] - - [4096, 1024, 1, 3343] - - [326, 9857.23] + - [555, 9857.23] - - [64, 35, 1808, 35] - - [309, 3175.44] + - [538, 3175.44] - - [1024, 4096, 1, 3357] - - [320, 9902.41] + - [549, 9902.41] - - [64, 143, 432, 143] - - [310, 6489.7] + - [539, 6489.7] - - [4096, 1024, 1, 3510] - - [326, 9867.4] + - [555, 9867.4] - - [4096, 1024, 1, 3369] - - [326, 9863.44] + - [555, 9863.44] - - [64, 92, 688, 93] - - [297, 6188.3] + - [526, 6188.3] - - [4096, 1024, 1, 3379] - - [326, 9870.12] + - [555, 9870.12] - - [1024, 4096, 1, 3276] - - [326, 9904.77] + - [555, 9904.77] - - [1024, 4096, 1, 3363] - - [326, 9925.13] + - [555, 9925.13] - - [4096, 1024, 1, 3055] - - [326, 9831.92] + - [555, 9831.92] - - [1024, 4096, 1, 3524] - - [319, 9923.79] + - [548, 9923.79] - - [4096, 1024, 1, 3057] - - [326, 9852.87] + - [555, 9852.87] - - [1024, 33708, 1, 3720] - - [320, 10327.1] + - [549, 10327.1] - - [1024, 4096, 1, 3383] - - [319, 9919.39] + - [548, 9919.39] - - [1024, 4096, 1, 3522] - - [320, 9932.56] + - [549, 9932.56] - - [1024, 33708, 1, 3956] - - [319, 10333.8] + - [548, 10333.8] - - [1024, 4096, 1, 3481] - - [319, 9922.08] + - [548, 9922.08] - - [4096, 1024, 1, 3562] - - [327, 9874.86] + - [556, 9874.86] - - [4096, 1024, 1, 3299] - - [326, 9872.97] + - [555, 9872.97] - - [1024, 4096, 1, 3262] - - [320, 9924.83] + - [549, 9924.83] - - [1024, 4096, 1, 3840] - - [319, 9961.84] + - [548, 9961.84] - - [1024, 33708, 1, 4026] - - [319, 10334.3] + - [548, 10334.3] - - [4096, 1024, 1, 3168] - - [320, 9878.45] + - [549, 9878.45] - - [64, 101, 624, 101] - - [300, 5734.72] + - [529, 5734.72] - - [1024, 4096, 1, 3999] - - [319, 9947.1] + - [548, 9947.1] - - [1024, 4096, 1, 3549] - - [319, 9923.3] + - [548, 9923.3] - - [4096, 1024, 1, 3375] - - [326, 9868.89] + - [555, 9868.89] - - [1024, 4096, 1, 3496] - - [327, 9928.67] + - [556, 9928.67] - - [64, 29, 2176, 29] - - [296, 3290.02] + - [525, 3290.02] - - [1024, 4096, 1, 3190] - - [327, 9897.61] + - [556, 9897.61] - - [4096, 1024, 1, 3273] - - [327, 9853.65] + - [556, 9853.65] - - [1024, 4096, 1, 3406] - - [326, 9907.04] + - [555, 9907.04] - - [4096, 1024, 1, 4005] - - [319, 9907.97] + - [548, 9907.97] - - [4096, 1024, 1, 3555] - - [326, 9878.96] + - [555, 9878.96] - - [4096, 1024, 1, 2505] - - [326, 9785.1] + - [555, 9785.1] - - [1024, 4096, 1, 3460] - - [326, 9930.24] + - [555, 9930.24] - - [64, 17, 3632, 17] - - [297, 1917.27] + - [526, 1917.27] - - [1024, 4096, 1, 3579] - - [320, 9920.94] + - [549, 9920.94] - - [1024, 33708, 1, 4030] - - [320, 10327.7] + - [549, 10327.7] - - [1024, 4096, 1, 3510] - - [320, 9931.31] + - [549, 9931.31] - - [1024, 1024, 1, 3969] - - [317, 9020.83] + - [546, 9020.83] - - [1024, 4096, 1, 3282] - - [327, 9920.05] + - [556, 9920.05] - - [1024, 4096, 1, 3377] - - [319, 9927.34] + - [548, 9927.34] - - [1024, 4096, 1, 2935] - - [327, 9903.48] + - [556, 9903.48] - - [64, 41, 1552, 41] - - [297, 3740.48] + - [526, 3740.48] - - [1024, 4096, 1, 3498] - - [319, 9915.01] + - [548, 9915.01] - - [1024, 4096, 1, 3593] - - [326, 9925.64] + - [555, 9925.64] - - [1024, 1024, 1, 3948] - - [325, 9009.03] + - [554, 9009.03] - - [4096, 1024, 1, 3226] - - [327, 9854.75] + - [556, 9854.75] - - [1024, 4096, 1, 2499] - - [326, 9904.82] + - [555, 9904.82] - - [1024, 4096, 1, 3296] - - [319, 9926.89] + - [548, 9926.89] - - [1024, 4096, 1, 3455] - - [326, 9917.52] + - [555, 9917.52] - - [1024, 4096, 1, 3399] - - [320, 9919.7] + - [549, 9919.7] - - [1024, 4096, 1, 3205] - - [319, 9917.74] + - [548, 9917.74] - - [4096, 1024, 1, 4026] - - [327, 9897.81] + - [556, 9897.81] - - [1024, 4096, 1, 3484] - - [319, 9915.53] + - [548, 9915.53] - - [4096, 1024, 1, 3302] - - [327, 9862.8] + - [556, 9862.8] - - [1024, 4096, 1, 3485] - - [327, 9913.0] + - [556, 9913.0] - - [1024, 1024, 1, 3996] - - [325, 9008.77] + - [554, 9008.77] - - [1024, 4096, 1, 3126] - - [320, 9910.16] + - [549, 9910.16] - - [1024, 4096, 1, 4050] - - [319, 9951.21] + - [548, 9951.21] - - [4096, 1024, 1, 3235] - - [320, 9870.74] + - [549, 9870.74] - - [1024, 33708, 1, 3955] - - [319, 10336.1] + - [548, 10336.1] - - [1024, 4096, 1, 3342] - - [319, 9903.85] + - [548, 9903.85] - - [1024, 1024, 1, 3900] - - [324, 9082.92] + - [553, 9082.92] - - [1024, 4096, 1, 3397] - - [327, 9922.7] + - [556, 9922.7] - - [4096, 1024, 1, 3491] - - [327, 9880.75] + - [556, 9880.75] - - [1024, 4096, 1, 3503] - - [319, 9923.28] + - [548, 9923.28] - - [1024, 4096, 1, 3140] - - [320, 9908.41] + - [549, 9908.41] - - [4096, 1024, 1, 3121] - - [326, 9860.32] + - [555, 9860.32] - - [4096, 1024, 1, 3276] - - [326, 9854.19] + - [555, 9854.19] - - [1024, 4096, 1, 3321] - - [327, 9917.86] + - [556, 9917.86] - - [1024, 4096, 1, 3870] - - [327, 9931.07] + - [556, 9931.07] - - [4096, 1024, 1, 3475] - - [326, 9877.58] + - [555, 9877.58] - - [1024, 4096, 1, 2984] - - [326, 9895.59] + - [555, 9895.59] - - [4096, 1024, 1, 3363] - - [320, 9873.44] + - [549, 9873.44] - - [1024, 4096, 1, 3582] - - [326, 9920.87] + - [555, 9920.87] - - [4096, 1024, 1, 3509] - - [326, 9886.86] + - [555, 9886.86] - - [1024, 4096, 1, 3426] - - [319, 9928.86] + - [548, 9928.86] - - [4096, 1024, 1, 3136] - - [326, 9872.61] + - [555, 9872.61] - - [1024, 4096, 1, 3232] - - [327, 9926.29] + - [556, 9926.29] - - [4096, 1024, 1, 3103] - - [326, 9839.03] + - [555, 9839.03] - - [1024, 4096, 1, 3335] - - [320, 9913.37] + - [549, 9913.37] - - [1024, 4096, 1, 3900] - - [319, 9938.01] + - [548, 9938.01] - - [4096, 1024, 1, 3512] - - [320, 9877.26] + - [549, 9877.26] - - [4096, 1024, 1, 3222] - - [326, 9859.77] + - [555, 9859.77] - - [1024, 4096, 1, 3165] - - [326, 9899.71] + - [555, 9899.71] - - [4096, 1024, 1, 3408] - - [326, 9899.68] + - [555, 9899.68] - - [4096, 1024, 1, 3751] - - [326, 9891.49] + - [555, 9891.49] - - [1024, 4096, 1, 3318] - - [319, 9913.42] + - [548, 9913.42] - - [4096, 1024, 1, 3442] - - [327, 9880.21] + - [556, 9880.21] - - [1024, 4096, 1, 3413] - - [326, 9921.9] + - [555, 9921.9] - - [4096, 1024, 1, 3524] - - [326, 9879.22] + - [555, 9879.22] - - [1024, 4096, 1, 3976] - - [327, 9945.57] + - [556, 9945.57] - - [1024, 4096, 1, 3475] - - [327, 9932.51] + - [556, 9932.51] - - [1024, 4096, 1, 3534] - - [319, 9911.49] + - [548, 9911.49] - - [4096, 1024, 1, 3301] - - [326, 9872.75] + - [555, 9872.75] - - [4096, 1024, 1, 3248] - - [326, 9878.22] + - [555, 9878.22] - - [1024, 4096, 1, 2977] - - [320, 9899.93] + - [549, 9899.93] - - [4096, 1024, 1, 3346] - - [326, 9876.07] + - [555, 9876.07] - - [1024, 4096, 1, 3451] - - [319, 9920.16] + - [548, 9920.16] - - [1024, 4096, 1, 3257] - - [320, 9905.02] + - [549, 9905.02] - - [1024, 1024, 1, 3640] - - [318, 8983.39] + - [547, 8983.39] - - [1024, 4096, 1, 3356] - - [319, 9904.48] + - [548, 9904.48] - - [4096, 1024, 1, 3348] - - [327, 9872.53] + - [556, 9872.53] - - [4096, 1024, 1, 3335] - - [326, 9865.82] + - [555, 9865.82] - - [4096, 1024, 1, 3505] - - [326, 9888.88] + - [555, 9888.88] - - [1024, 4096, 1, 3490] - - [319, 9938.0] + - [548, 9938.0] - - [4096, 1024, 1, 3447] - - [326, 9865.39] + - [555, 9865.39] - - [1024, 4096, 1, 3267] - - [327, 9919.32] + - [556, 9919.32] - - [4096, 1024, 1, 3230] - - [326, 9853.2] + - [555, 9853.2] - - [4096, 1024, 1, 3455] - - [326, 9862.44] + - [555, 9862.44] - - [1024, 4096, 1, 3925] - - [319, 9945.64] + - [548, 9945.64] - - [1024, 4096, 1, 3362] - - [320, 9921.63] + - [549, 9921.63] - - [4096, 1024, 1, 3969] - - [327, 9911.98] + - [556, 9911.98] - - [4096, 1024, 1, 3527] - - [326, 9882.87] + - [555, 9882.87] - - [1024, 4096, 1, 3585] - - [320, 9946.52] + - [549, 9946.52] - - [4096, 1024, 1, 3063] - - [326, 9854.03] + - [555, 9854.03] - - [4096, 1024, 1, 3435] - - [326, 9867.13] + - [555, 9867.13] - - [4096, 1024, 1, 3366] - - [327, 9864.02] + - [556, 9864.02] - - [4096, 1024, 1, 3581] - - [319, 9868.57] + - [548, 9868.57] - - [1024, 33708, 1, 3906] - - [319, 10339.3] + - [548, 10339.3] - - [1024, 4096, 1, 3464] - - [327, 9916.21] + - [556, 9916.21] - - [1024, 4096, 1, 3440] - - [326, 9945.25] + - [555, 9945.25] - - [4096, 1024, 1, 3143] - - [326, 9846.76] + - [555, 9846.76] - - [1024, 4096, 1, 3349] - - [320, 9912.83] + - [549, 9912.83] - - [4096, 1024, 1, 3416] - - [326, 9885.13] + - [555, 9885.13] - - [4096, 1024, 1, 3365] - - [326, 9876.0] + - [555, 9876.0] - - [1024, 4096, 1, 3470] - - [327, 9914.98] + - [556, 9914.98] - - [4096, 1024, 1, 3287] - - [326, 9860.69] + - [555, 9860.69] - - [1024, 4096, 1, 3441] - - [327, 9928.98] + - [556, 9928.98] - - [4096, 1024, 1, 3224] - - [326, 9857.83] + - [555, 9857.83] - - [1024, 4096, 1, 3387] - - [319, 9911.72] + - [548, 9911.72] - - [1024, 4096, 1, 3547] - - [319, 9920.36] + - [548, 9920.36] - - [4096, 1024, 1, 3478] - - [320, 9882.9] + - [549, 9882.9] - - [4096, 1024, 1, 3548] - - [327, 9869.45] + - [556, 9869.45] - - [1024, 33708, 1, 4020] - - [319, 10345.3] + - [548, 10345.3] - - [4096, 1024, 1, 3320] - - [326, 9863.74] + - [555, 9863.74] - - [1024, 4096, 1, 3906] - - [326, 9942.67] + - [555, 9942.67] - - [4096, 1024, 1, 3796] - - [326, 9899.13] + - [555, 9899.13] - - [1024, 4096, 1, 3306] - - [319, 9902.4] + - [548, 9902.4] - - [1024, 4096, 1, 3401] - - [327, 9913.95] + - [556, 9913.95] - - [64, 147, 432, 147] - - [310, 6626.6] + - [539, 6626.6] - - [1024, 4096, 1, 3215] - - [327, 9911.24] + - [556, 9911.24] - - [4096, 1024, 1, 4012] - - [327, 9898.2] + - [556, 9898.2] - - [1024, 4096, 1, 2765] - - [327, 9863.73] + - [556, 9863.73] - - [4096, 1024, 1, 3554] - - [320, 9883.52] + - [549, 9883.52] - - [4096, 1024, 1, 3423] - - [326, 9866.72] + - [555, 9866.72] - - [1024, 1024, 1, 3751] - - [324, 9006.36] + - [553, 9006.36] - - [1024, 4096, 1, 3562] - - [320, 9922.08] + - [549, 9922.08] - - [1024, 4096, 1, 3489] - - [319, 9936.78] + - [548, 9936.78] - - [4096, 1024, 1, 3358] - - [326, 9858.22] + - [555, 9858.22] - - [4096, 1024, 1, 3270] - - [327, 9850.84] + - [556, 9850.84] - - [1024, 4096, 1, 3293] - - [319, 9905.33] + - [548, 9905.33] - - [1024, 4096, 1, 3376] - - [319, 9934.98] + - [548, 9934.98] - - [4096, 1024, 1, 3245] - - [326, 9852.52] + - [555, 9852.52] - - [4096, 1024, 1, 3541] - - [326, 9887.22] + - [555, 9887.22] - - [4096, 1024, 1, 3443] - - [326, 9871.73] + - [555, 9871.73] - - [4096, 1024, 1, 3438] - - [327, 9863.86] + - [556, 9863.86] - - [4096, 1024, 1, 3244] - - [326, 9859.76] + - [555, 9859.76] - - [1024, 4096, 1, 3365] - - [326, 9922.1] + - [555, 9922.1] - - [1024, 4096, 1, 3299] - - [320, 9923.38] + - [549, 9923.38] - - [4096, 1024, 1, 3840] - - [326, 9914.75] + - [555, 9914.75] - - [1024, 4096, 1, 3471] - - [327, 9918.38] + - [556, 9918.38] - - [1024, 4096, 1, 3398] - - [319, 9918.99] + - [548, 9918.99] - - [4096, 1024, 1, 3162] - - [326, 9843.93] + - [555, 9843.93] - - [1024, 4096, 1, 4005] - - [320, 9947.87] + - [549, 9947.87] - - [4096, 1024, 1, 3579] - - [326, 9868.25] + - [555, 9868.25] - - [64, 18, 3440, 18] - - [302, 2059.33] + - [531, 2059.33] - - [64, 177, 352, 177] - - [321, 7315.4] + - [550, 7315.4] - - [1024, 4096, 1, 3121] - - [327, 9930.34] + - [556, 9930.34] - - [4096, 1024, 1, 3441] - - [326, 9883.28] + - [555, 9883.28] - - [4096, 1024, 1, 3422] - - [326, 9858.41] + - [555, 9858.41] - - [4096, 1024, 1, 3444] - - [326, 9887.03] + - [555, 9887.03] - - [1024, 4096, 1, 3337] - - [320, 9911.45] + - [549, 9911.45] - - [4096, 1024, 1, 3550] - - [319, 9871.87] + - [548, 9871.87] - - [1024, 4096, 1, 3477] - - [319, 9930.65] + - [548, 9930.65] - - [4096, 1024, 1, 3490] - - [326, 9878.45] + - [555, 9878.45] - - [4096, 1024, 1, 3585] - - [326, 9893.63] + - [555, 9893.63] - - [1024, 4096, 1, 3143] - - [319, 9901.19] + - [548, 9901.19] - - [1024, 33708, 1, 3876] - - [320, 10330.8] + - [549, 10330.8] - - [1024, 4096, 1, 3320] - - [327, 9913.18] + - [556, 9913.18] - - [1024, 4096, 1, 3423] - - [327, 9914.14] + - [556, 9914.14] - - [1024, 4096, 1, 3894] - - [319, 9944.47] + - [548, 9944.47] - - [4096, 1024, 1, 3410] - - [326, 9878.67] + - [555, 9878.67] - - [1024, 4096, 1, 3561] - - [319, 9926.68] + - [548, 9926.68] - - [4096, 1024, 1, 3492] - - [320, 9872.92] + - [549, 9872.92] - - [64, 85, 752, 85] - - [297, 5734.35] + - [526, 5734.35] - - [36548, 1024, 1, 3712] - - [329, 10367.6] + - [558, 10367.6] - - [4096, 2048, 1, 128] - - [330, 8743.93] + - [559, 8743.93] - - [1024, 1024, 1, 3712] - - [331, 9976.29] + - [560, 9976.29] - - [1024, 1024, 1, 128] - - [328, 5765.47] + - [557, 5765.47] - - [4096, 3072, 1, 128] - - [330, 8869.11] + - [559, 8869.11] + - - [768, 3072, 1, 4096] + - [571, 10028.8] + - - [64, 256, 192, 256] + - [565, 8791.65] + - - [768, 2, 1, 16] + - [568, 5.05484] + - - [768, 768, 1, 64] + - [564, 3469.65] + - - [768, 768, 1, 4096] + - [572, 7475.1] + - - [768, 30522, 1, 1280] + - [575, 10297.0] + - - [64, 128, 384, 128] + - [565, 7660.93] + - - [768, 30522, 1, 320] + - [573, 10008.0] + - - [768, 768, 1, 32] + - [562, 2359.4] + - - [3072, 768, 1, 4096] + - [571, 10033.8] + - - [768, 30522, 1, 640] + - [574, 10206.8] + - - [64, 64, 768, 64] + - [563, 5494.82] + - - [768, 768, 1, 640] + - [572, 6721.74] + - - [768, 768, 1, 16] + - [561, 1203.82] + - - [768, 768, 1, 1280] + - [570, 7138.67] + - - [768, 2, 1, 32] + - [566, 11.9154] + - - [2048, 2048, 1, 512] + - [586, 9607.67] + - - [512, 32, 1, 200] + - [579, 422.368] + - - [1024, 1, 1, 200] + - [582, 24.7154] + - - [1600, 1024, 1, 512] + - [577, 8116.01] + - - [560, 1024, 1, 200] + - [576, 4810.84] + - - [1024, 1024, 1, 512] + - [585, 8614.84] + - - [2048, 1, 1, 512] + - [580, 81.0086] + - - [512, 512, 1, 200] + - [578, 4398.49] + - - [100, 2048, 1, 512] + - [583, 4443.22] + - - [1024, 1024, 1, 200] + - [584, 6990.61] + - - [1024, 64, 1, 512] + - [581, 2853.37] + - - [1024, 256, 1, 18944] + - [605, 9196.51] + - - [256, 3328, 1, 8976] + - [595, 8299.36] + - - [1024, 256, 1, 4352] + - [603, 8813.84] + - - [256, 9728, 1, 8976] + - [598, 9638.58] + - - [1024, 256, 1, 3072] + - [605, 8640.73] + - - [768, 2048, 1, 256] + - [597, 8663.03] + - - [1024, 256, 1, 19968] + - [602, 9220.96] + - - [256, 12800, 1, 8976] + - [592, 9418.52] + - - [1024, 256, 1, 3328] + - [606, 8682.58] + - - [256, 10240, 1, 8976] + - [599, 10137.8] + - - [1024, 256, 1, 15104] + - [604, 9167.13] + - - [256, 10496, 1, 8976] + - [592, 9858.48] + - - [1024, 256, 1, 2816] + - [607, 8575.81] + - - [1024, 256, 1, 4608] + - [602, 8861.31] + - - [256, 11264, 1, 8976] + - [589, 9627.79] + - - [1024, 256, 1, 6400] + - [602, 8985.33] + - - [1024, 256, 1, 16128] + - [602, 9170.36] + - - [256, 44505, 1, 8976] + - [596, 10331.9] + - - [256, 6144, 1, 8976] + - [599, 10395.1] + - - [1024, 256, 1, 5120] + - [604, 8881.63] + - - [1024, 256, 1, 7936] + - [607, 9023.24] + - - [256, 3840, 1, 8976] + - [594, 9541.38] + - - [1024, 256, 1, 21248] + - [602, 9209.82] + - - [1024, 256, 1, 12032] + - [604, 9156.27] + - - [256, 8192, 1, 8976] + - [601, 10374.5] + - - [1024, 256, 1, 3584] + - [603, 8712.3] + - - [1024, 256, 1, 14336] + - [604, 9162.61] + - - [256, 7168, 1, 8976] + - [590, 9554.96] + - - [1024, 256, 1, 13568] + - [602, 9165.14] + - - [256, 4096, 1, 8976] + - [594, 10146.7] + - - [1024, 256, 1, 4096] + - [603, 8783.98] + - - [256, 2560, 1, 8976] + - [593, 8381.66] + - - [256, 20992, 1, 8976] + - [592, 9989.96] + - - [256, 4352, 1, 8976] + - [593, 9635.02] + - - [256, 33536, 1, 8976] + - [592, 10218.2] + - - [256, 3584, 1, 8976] + - [594, 8924.6] + - - [256, 26112, 1, 8976] + - [593, 10272.4] + - - [256, 14336, 1, 8976] + - [597, 10217.4] + - - [1024, 256, 1, 14848] + - [604, 9185.29] + - - [1024, 256, 1, 8448] + - [605, 9025.99] + - - [1024, 256, 1, 28672] + - [602, 9256.5] + - - [1024, 256, 1, 5632] + - [602, 8932.79] + - - [256, 22016, 1, 8976] + - [597, 10152.0] + - - [1024, 256, 1, 33536] + - [602, 9243.17] + - - [256, 5120, 1, 8976] + - [588, 9418.15] + - - [256, 11520, 1, 8976] + - [595, 9701.1] + - - [256, 19968, 1, 8976] + - [593, 10228.1] + - - [1024, 256, 1, 5376] + - [604, 8892.62] + - - [1024, 256, 1, 22016] + - [602, 9244.34] + - - [256, 8960, 1, 8976] + - [593, 9841.41] + - - [1024, 256, 1, 15872] + - [602, 9223.25] + - - [256, 17408, 1, 8976] + - [597, 9785.87] + - - [256, 5632, 1, 8976] + - [597, 9564.32] + - - [256, 32512, 1, 8976] + - [596, 10358.0] + - - [256, 11008, 1, 8976] + - [589, 9445.23] + - - [1024, 256, 1, 6144] + - [604, 8955.91] + - - [256, 4864, 1, 8976] + - [589, 8979.45] + - - [256, 15104, 1, 8976] + - [592, 10007.1] + - - [1024, 256, 1, 9984] + - [602, 9110.53] + - - [256, 1280, 1, 8976] + - [588, 5944.44] + - - [1024, 256, 1, 1024] + - [604, 7005.2] + - - [1024, 256, 1, 9728] + - [604, 9066.29] + - - [1024, 256, 1, 10496] + - [602, 9118.15] + - - [256, 11776, 1, 8976] + - [599, 9911.74] + - - [256, 12544, 1, 8976] + - [592, 9235.35] + - - [1024, 256, 1, 17152] + - [602, 9152.31] + - - [1024, 256, 1, 11520] + - [604, 9146.87] + - - [1024, 256, 1, 21504] + - [604, 9207.52] + - - [256, 17152, 1, 8976] + - [591, 9654.81] + - - [1024, 256, 1, 17408] + - [602, 9181.27] + - - [256, 15872, 1, 8976] + - [600, 10086.5] + - - [256, 18688, 1, 8976] + - [593, 9612.57] + - - [256, 5888, 1, 8976] + - [597, 9988.43] + - - [512, 2048, 1, 256] + - [587, 7678.46] + - - [1024, 256, 1, 7680] + - [605, 9033.06] + - - [1024, 256, 1, 1280] + - [607, 7767.33] + - - [256, 14848, 1, 8976] + - [593, 9852.76] + - - [256, 9984, 1, 8976] + - [599, 9908.97] + - - [256, 20480, 1, 8976] + - [597, 10337.2] + - - [1024, 256, 1, 8192] + - [604, 9044.42] + - - [1024, 256, 1, 19712] + - [603, 9184.28] + - - [256, 13568, 1, 8976] + - [593, 9927.92] + - - [256, 13312, 1, 8976] + - [592, 9758.01] + - - [256, 2816, 1, 8976] + - [592, 9191.53] + - - [1024, 256, 1, 2304] + - [603, 8445.01] + - - [256, 21248, 1, 8976] + - [593, 10127.6] + - - [256, 16128, 1, 8976] + - [601, 10238.5] + - - [256, 512, 36, 98] + - [624, 7994.95] + - - [64, 192, 36, 25088] + - [693, 8613.99] + - - [128, 128, 64, 25] + - [623, 2540.25] + - - [256, 256, 64, 56] + - [624, 6924.66] + - - [512, 486, 36, 800] + - [631, 8994.94] + - - [512, 512, 36, 1568] + - [642, 9872.48] + - - [64, 192, 64, 3200] + - [687, 9295.99] + - - [256, 384, 36, 4096] + - [687, 9334.71] + - - [128, 256, 64, 32] + - [626, 4280.0] + - - [64, 128, 64, 23104] + - [693, 10103.2] + - - [128, 256, 64, 9] + - [617, 1709.73] + - - [256, 512, 36, 784] + - [627, 9520.83] + - - [256, 324, 36, 32] + - [665, 4473.48] + - - [512, 512, 36, 33] + - [636, 5925.27] + - - [16, 32, 36, 5760] + - [640, 1448.9] + - - [192, 384, 64, 128] + - [687, 8618.53] + - - [512, 512, 64, 72] + - [643, 8260.22] + - - [128, 128, 64, 1600] + - [616, 9008.48] + - - [512, 512, 36, 128] + - [687, 8871.72] + - - [192, 384, 64, 2304] + - [616, 9657.26] + - - [384, 256, 64, 450] + - [652, 9539.03] + - - [3, 64, 36, 6272] + - [640, 509.884] + - - [3, 64, 64, 2888] + - [669, 708.721] + - - [384, 256, 64, 2304] + - [652, 10287.6] + - - [512, 512, 64, 144] + - [687, 9226.8] + - - [256, 256, 36, 6272] + - [627, 9607.38] + - - [80, 192, 64, 4608] + - [688, 7348.03] + - - [64, 64, 36, 3136] + - [675, 5959.15] + - - [256, 384, 64, 2304] + - [652, 10283.5] + - - [512, 512, 36, 66] + - [636, 7618.18] + - - [128, 256, 64, 800] + - [662, 9611.25] + - - [64, 128, 36, 30] + - [618, 1242.71] + - - [192, 256, 36, 512] + - [687, 8658.07] + - - [256, 512, 64, 200] + - [687, 9153.97] + - - [256, 512, 64, 25] + - [665, 5349.98] + - - [3, 64, 64, 46208] + - [668, 808.662] + - - [128, 256, 36, 1568] + - [660, 8528.72] + - - [64, 128, 64, 11552] + - [693, 9997.1] + - - [128, 192, 64, 946] + - [687, 9198.48] + - - [64, 192, 64, 12800] + - [648, 9000.76] + - - [224, 224, 64, 128] + - [625, 6312.17] + - - [128, 256, 64, 288] + - [687, 8697.97] + - - [64, 64, 64, 826] + - [630, 6650.31] + - - [256, 384, 64, 1152] + - [662, 10106.9] + - - [3, 64, 64, 92416] + - [668, 812.131] + - - [32, 32, 36, 43808] + - [609, 2813.19] + - - [160, 320, 64, 288] + - [619, 8090.96] + - - [1, 16, 36, 23040] + - [656, 42.7667] + - - [128, 256, 36, 128] + - [634, 6049.58] + - - [128, 128, 64, 3360] + - [687, 9200.06] + - - [128, 128, 64, 420] + - [687, 8131.6] + - - [64, 128, 64, 361] + - [624, 6938.08] + - - [512, 512, 36, 16] + - [680, 3797.76] + - - [384, 256, 36, 800] + - [621, 9151.75] + - - [192, 384, 36, 4096] + - [621, 8867.67] + - - [64, 64, 64, 1600] + - [673, 7931.84] + - - [256, 384, 64, 576] + - [653, 9745.9] + - - [512, 512, 64, 14] + - [636, 3638.28] + - - [512, 512, 36, 8] + - [611, 2279.61] + - - [512, 486, 64, 128] + - [627, 8337.93] + - - [1, 16, 64, 640] + - [661, 50.0512] + - - [64, 96, 64, 288] + - [686, 5708.07] + - - [96, 96, 36, 1568] + - [655, 6866.85] + - - [256, 256, 36, 128] + - [659, 7703.92] + - - [64, 128, 36, 53824] + - [647, 6331.41] + - - [256, 256, 36, 32] + - [643, 4648.96] + - - [192, 256, 64, 288] + - [687, 8987.89] + - - [256, 256, 36, 16] + - [657, 2912.81] + - - [128, 256, 36, 3200] + - [660, 8680.37] + - - [160, 320, 64, 512] + - [619, 8449.54] + - - [128, 160, 36, 512] + - [630, 7215.07] + - - [96, 96, 36, 2592] + - [625, 7104.89] + - - [64, 96, 64, 800] + - [655, 7268.42] + - - [147, 64, 36, 18816] + - [671, 7116.36] + - - [160, 320, 36, 512] + - [625, 7874.92] + - - [256, 512, 36, 4] + - [664, 1034.88] + - - [96, 128, 64, 946] + - [647, 7901.17] + - - [256, 324, 64, 1568] + - [652, 8589.63] + - - [128, 128, 64, 50] + - [643, 4070.66] + - - [35, 96, 36, 8960] + - [637, 4207.4] + - - [32, 64, 36, 43808] + - [678, 4390.91] + - - [160, 224, 36, 128] + - [625, 5447.02] + - - [64, 64, 64, 81] + - [650, 2391.28] + - - [256, 256, 36, 3200] + - [616, 9559.65] + - - [256, 256, 36, 210] + - [627, 8414.71] + - - [192, 384, 64, 576] + - [687, 9468.85] + - - [512, 512, 64, 800] + - [662, 10096.5] + - - [512, 24, 36, 800] + - [613, 4761.87] + - - [64, 64, 64, 13216] + - [674, 8491.51] + - - [192, 224, 64, 1152] + - [630, 8769.16] + - - [256, 256, 64, 1152] + - [652, 9988.19] + - - [512, 486, 64, 512] + - [662, 9254.77] + - - [128, 128, 36, 784] + - [625, 7468.16] + - - [256, 512, 64, 1600] + - [649, 10232.6] + - - [512, 512, 64, 9] + - [643, 2599.88] + - - [96, 128, 64, 288] + - [655, 6599.53] + - - [64, 96, 36, 512] + - [655, 5073.85] + - - [256, 512, 36, 1568] + - [687, 9637.91] + - - [128, 128, 64, 400] + - [687, 8192.1] + - - [128, 128, 64, 800] + - [687, 8716.44] + - - [96, 128, 36, 512] + - [675, 6757.03] + - - [16, 32, 36, 360] + - [638, 754.136] + - - [128, 256, 64, 3200] + - [652, 10222.6] + - - [96, 128, 64, 800] + - [655, 7968.0] + - - [256, 512, 64, 4] + - [617, 1098.09] + - - [256, 256, 64, 450] + - [662, 9347.55] + - - [64, 64, 64, 3200] + - [673, 8518.18] + - - [192, 224, 64, 128] + - [633, 7035.27] + - - [128, 128, 64, 288] + - [687, 7751.38] + - - [256, 256, 64, 72] + - [643, 7489.93] + - - [96, 208, 36, 512] + - [655, 6939.21] + - - [128, 256, 36, 3136] + - [630, 8669.43] + - - [64, 64, 36, 3520] + - [625, 6007.57] + - - [64, 128, 36, 1568] + - [688, 6897.8] + - - [160, 320, 64, 242] + - [614, 7873.27] + - - [192, 192, 36, 512] + - [625, 7707.42] + - - [512, 512, 36, 512] + - [687, 9582.52] + - - [1, 16, 64, 10240] + - [639, 71.4511] + - - [128, 128, 36, 512] + - [625, 7149.48] + - - [512, 512, 36, 256] + - [616, 9384.5] + - - [512, 512, 36, 1024] + - [610, 9777.99] + - - [96, 208, 64, 1152] + - [688, 7851.0] + - - [128, 192, 64, 3200] + - [616, 9490.92] + - - [256, 256, 36, 4096] + - [621, 9585.56] + - - [160, 160, 64, 288] + - [655, 7299.9] + - - [256, 256, 64, 896] + - [652, 9850.43] + - - [128, 256, 64, 242] + - [687, 8391.48] + - - [128, 128, 36, 440] + - [630, 6274.82] + - - [96, 128, 36, 1568] + - [675, 7875.13] + - - [192, 384, 36, 1024] + - [621, 8715.82] + - - [64, 96, 36, 10368] + - [692, 7478.69] + - - [128, 256, 64, 100] + - [636, 7085.07] + - - [112, 224, 36, 2048] + - [629, 7556.02] + - - [384, 256, 64, 1152] + - [652, 10102.4] + - - [192, 384, 36, 128] + - [687, 7543.14] + - - [128, 128, 36, 7040] + - [660, 7600.7] + - - [128, 256, 64, 1568] + - [652, 10006.0] + - - [128, 128, 36, 1568] + - [644, 7848.4] + - - [128, 256, 64, 72] + - [667, 6553.7] + - - [256, 256, 36, 12544] + - [681, 9365.14] + - - [256, 256, 36, 105] + - [643, 7286.16] + - - [128, 256, 36, 392] + - [630, 7625.79] + - - [64, 64, 64, 5408] + - [673, 8882.77] + - - [3, 64, 36, 25088] + - [640, 529.042] + - - [384, 256, 36, 1024] + - [687, 9182.85] + - - [35, 96, 36, 13440] + - [694, 4110.39] + - - [128, 256, 64, 1152] + - [652, 9804.97] + - - [256, 324, 64, 32] + - [665, 5043.73] + - - [160, 224, 64, 128] + - [679, 6046.25] + - - [192, 224, 36, 2592] + - [677, 8878.78] + - - [96, 96, 64, 1152] + - [655, 8035.55] + - - [32, 64, 36, 90] + - [612, 964.565] + - - [64, 128, 64, 2888] + - [627, 9047.33] + - - [256, 384, 36, 800] + - [687, 9154.12] + - - [512, 512, 64, 4] + - [684, 1233.72] + - - [192, 320, 36, 128] + - [624, 7388.29] + - - [64, 128, 36, 480] + - [688, 5653.37] + - - [192, 384, 64, 242] + - [687, 9080.09] + - - [256, 486, 64, 32] + - [680, 5909.28] + - - [147, 64, 64, 9702] + - [689, 7319.79] + - - [512, 512, 64, 64] + - [623, 8179.12] + - - [64, 192, 64, 3698] + - [616, 9287.99] + - - [73, 192, 64, 10439] + - [647, 6668.12] + - - [1, 16, 36, 1440] + - [663, 33.5452] + - - [128, 256, 36, 512] + - [630, 7989.25] + - - [512, 512, 64, 576] + - [662, 9951.99] + - - [64, 64, 36, 12544] + - [678, 5872.87] + - - [128, 128, 36, 880] + - [675, 7597.36] + - - [192, 224, 36, 128] + - [633, 6451.3] + - - [64, 64, 64, 800] + - [673, 6916.83] + - - [64, 128, 36, 12544] + - [651, 6395.98] + - - [64, 64, 36, 1568] + - [625, 5536.76] + - - [160, 160, 36, 512] + - [625, 7345.36] + - - [512, 24, 64, 512] + - [615, 5242.98] + - - [3, 64, 36, 3136] + - [640, 475.452] + - - [256, 256, 64, 9] + - [665, 2106.61] + - - [3, 64, 64, 11552] + - [668, 785.227] + - - [128, 256, 36, 12544] + - [683, 8792.23] + - - [128, 128, 36, 3136] + - [644, 8098.56] + - - [256, 512, 36, 3136] + - [627, 9694.49] + - - [64, 64, 36, 196] + - [641, 2757.86] + - - [144, 288, 36, 512] + - [675, 7077.99] + - - [256, 24, 64, 32] + - [654, 1483.93] + - - [384, 384, 36, 800] + - [616, 9246.6] + - - [512, 512, 64, 1600] + - [662, 10277.4] + - - [112, 224, 36, 512] + - [630, 6744.88] + - - [128, 128, 36, 49] + - [636, 2716.39] + - - [512, 512, 36, 4] + - [664, 1156.62] + - - [35, 96, 64, 4235] + - [625, 4631.38] + - - [192, 384, 64, 450] + - [616, 9372.3] + - - [256, 256, 36, 1024] + - [687, 9346.74] + - - [112, 224, 64, 1152] + - [630, 7524.05] + - - [256, 512, 64, 400] + - [649, 9598.05] + - - [149, 32, 36, 19072] + - [694, 5811.9] + - - [128, 256, 36, 6272] + - [630, 8754.78] + - - [128, 192, 36, 1568] + - [655, 8195.2] + - - [256, 256, 36, 512] + - [687, 9074.32] + - - [256, 256, 64, 112] + - [687, 8305.65] + - - [512, 512, 64, 18] + - [680, 4324.12] + - - [256, 256, 64, 18] + - [643, 3547.91] + - - [256, 256, 64, 1568] + - [652, 10141.8] + - - [64, 96, 36, 1568] + - [673, 6805.76] + - - [384, 256, 36, 4096] + - [687, 9311.2] + - - [256, 512, 64, 800] + - [662, 9998.45] + - - [256, 384, 36, 2048] + - [687, 9285.44] + - - [3, 64, 36, 200704] + - [669, 547.475] + - - [384, 384, 64, 2304] + - [610, 9901.78] + - - [160, 320, 64, 128] + - [646, 7113.91] + - - [512, 512, 36, 528] + - [616, 9567.75] + - - [160, 320, 36, 128] + - [647, 6411.23] + - - [96, 96, 64, 800] + - [655, 7690.11] + - - [256, 512, 36, 49] + - [643, 6721.35] + - - [384, 384, 64, 450] + - [616, 9523.63] + - - [3, 64, 64, 23104] + - [668, 801.721] + - - [256, 256, 64, 3200] + - [652, 10300.5] + - - [128, 192, 36, 512] + - [630, 7499.85] + - - [192, 192, 64, 288] + - [687, 8774.34] + - - [96, 208, 64, 242] + - [647, 5902.09] + - - [256, 16, 36, 3200] + - [676, 3807.87] + - - [512, 512, 64, 8] + - [654, 2379.85] + - - [64, 128, 64, 5776] + - [627, 9332.84] + - - [512, 512, 64, 288] + - [616, 9522.09] + - - [256, 16, 36, 32] + - [672, 766.105] + - - [128, 192, 64, 288] + - [687, 8527.68] + - - [32, 64, 64, 640] + - [655, 4660.44] + - - [64, 64, 36, 392] + - [655, 3686.5] + - - [384, 384, 36, 1024] + - [621, 9282.58] + - - [64, 64, 36, 11552] + - [685, 5904.88] + - - [96, 128, 36, 6272] + - [675, 8351.09] + - - [128, 256, 36, 16] + - [657, 2144.91] + - - [256, 256, 64, 288] + - [687, 9140.23] + - - [64, 64, 64, 1652] + - [673, 7766.63] + - - [256, 384, 36, 1024] + - [621, 9203.37] + - - [96, 128, 64, 3200] + - [690, 8866.3] + - - [256, 324, 36, 3200] + - [629, 8194.35] + - - [128, 192, 64, 800] + - [687, 9198.13] + - - [64, 128, 64, 10] + - [628, 851.217] + - - [96, 208, 64, 288] + - [655, 6667.68] + - - [64, 96, 36, 2592] + - [637, 7216.98] + - - [64, 128, 64, 160] + - [666, 5191.07] + - - [192, 384, 64, 512] + - [616, 9446.14] + - - [64, 64, 36, 6272] + - [625, 6212.11] + - - [512, 24, 36, 288] + - [622, 3922.57] + - - [128, 128, 64, 1568] + - [616, 9037.96] + - - [112, 224, 64, 242] + - [686, 6399.36] + - - [128, 256, 64, 1600] + - [652, 10010.4] + - - [32, 32, 64, 20000] + - [620, 4378.51] + - - [160, 192, 64, 288] + - [647, 7803.73] + - - [512, 24, 64, 128] + - [608, 3733.9] + - - [512, 512, 36, 32] + - [643, 5935.44] + - - [3, 64, 36, 100352] + - [640, 542.883] + - - [3, 64, 64, 1444] + - [669, 674.259] + - - [512, 512, 36, 3136] + - [610, 9921.2] + - - [128, 256, 64, 6400] + - [670, 10349.4] + - - [256, 256, 36, 2048] + - [687, 9519.09] + - - [128, 160, 64, 288] + - [630, 7549.85] + - - [256, 256, 64, 6400] + - [652, 10392.7] + - - [32, 64, 64, 20000] + - [678, 6493.96] + - - [256, 256, 36, 1680] + - [627, 9513.39] + - - [128, 128, 64, 210] + - [687, 7094.2] + - - [192, 384, 36, 2048] + - [616, 8818.75] + - - [256, 256, 64, 144] + - [687, 8608.71] + - - [384, 384, 36, 4096] + - [621, 9357.04] + - - [160, 320, 64, 1152] + - [647, 8749.58] + - - [384, 256, 36, 2048] + - [687, 9279.73] + - - [256, 512, 36, 392] + - [687, 9252.24] + - - [256, 512, 64, 50] + - [643, 7511.39] + - - [73, 192, 36, 23360] + - [691, 5803.03] + - - [3, 64, 36, 50176] + - [640, 542.137] + - - [384, 384, 36, 2048] + - [616, 9325.9] + - - [256, 384, 64, 450] + - [662, 9528.76] + - - [192, 320, 64, 128] + - [621, 8399.91] + - - [128, 256, 36, 32] + - [636, 3276.9] + - - [160, 192, 36, 512] + - [675, 7752.44] + - - [512, 512, 64, 256] + - [627, 9473.74] + - - [256, 512, 64, 32] + - [665, 6391.42] + - - [384, 384, 64, 576] + - [616, 9614.89] + - - [64, 64, 64, 648] + - [673, 6282.25] + - - [512, 486, 36, 288] + - [687, 8625.03] + - - [32, 64, 36, 1440] + - [625, 3961.6] + - - [144, 288, 64, 242] + - [647, 6347.12] + - - [384, 256, 64, 576] + - [652, 9775.34] + - - [512, 512, 36, 64] + - [623, 7791.38] + - - [448, 384, 64, 128] + - [616, 9132.33] + - - [64, 128, 64, 722] + - [666, 8047.21] + - - [144, 288, 64, 288] + - [675, 6859.5] + - - [512, 512, 64, 224] + - [687, 9427.39] + - - [112, 224, 64, 288] + - [686, 6737.02] + - - [384, 384, 64, 1152] + - [610, 9820.56] + - - [448, 384, 36, 128] + - [687, 8761.41] + - - [64, 64, 64, 100] + - [633, 2708.2] + - - [256, 486, 36, 128] + - [659, 7640.14] + - - [64, 96, 64, 4608] + - [688, 8351.59] + - - [16, 32, 64, 160] + - [612, 736.46] + - - [64, 192, 36, 6272] + - [688, 8041.29] + - - [64, 64, 64, 200] + - [641, 3924.41] + - - [256, 256, 36, 800] + - [687, 9299.65] + - - [64, 128, 36, 6272] + - [685, 6816.46] + - - [32, 64, 64, 40] + - [632, 885.722] + - - [256, 16, 64, 32] + - [682, 1205.36] + - - [192, 384, 36, 800] + - [621, 8673.98] + - - [128, 128, 36, 3200] + - [655, 8538.99] + - - [256, 256, 36, 256] + - [627, 8454.46] + - - [192, 384, 64, 1152] + - [616, 9589.11] + - - [128, 256, 64, 200] + - [626, 8141.22] + - - [64, 96, 64, 1152] + - [655, 7620.98] + - - [128, 128, 36, 392] + - [630, 6175.61] + - - [80, 192, 36, 10368] + - [678, 6497.26] + - - [224, 224, 36, 128] + - [688, 5826.99] + - - [512, 512, 64, 28] + - [643, 5728.91] + - - [256, 16, 64, 1568] + - [658, 4637.3] + - - [144, 288, 64, 1152] + - [675, 7784.34] + - - [256, 256, 64, 576] + - [652, 9596.22] + - - [64, 128, 36, 784] + - [688, 6059.09] + - - [256, 24, 36, 128] + - [622, 2239.94] + - - [256, 256, 64, 2304] + - [652, 10225.8] + - - [192, 384, 36, 512] + - [687, 8549.13] + - - [16, 32, 64, 2560] + - [640, 2153.23] + - - [256, 512, 36, 32] + - [665, 5702.33] + - - [512, 512, 64, 128] + - [687, 9084.21] + - - [128, 128, 64, 200] + - [624, 6972.01] + - - [512, 512, 64, 32] + - [636, 6248.6] + - - [128, 256, 36, 196] + - [636, 6628.86] + - - [8, 384, 64, 6600] + - [668, 2733.99] + - - [149, 32, 64, 8195] + - [630, 6051.01] + - - [35, 96, 64, 6160] + - [675, 4689.45] + - - [64, 64, 36, 1760] + - [625, 5622.34] + - - [196, 528, 32, 32] + - [708, 4088.51] + - - [5329, 64, 32, 80] + - [701, 8331.24] + - - [64, 2880, 1, 320] + - [752, 4362.7] + - - [49, 832, 32, 256] + - [715, 5618.73] + - - [3136, 64, 64, 64] + - [701, 8457.75] + - - [196, 512, 32, 24] + - [702, 3621.83] + - - [289, 1120, 1, 160] + - [698, 3302.96] + - - [1225, 192, 32, 32] + - [706, 6194.67] + - - [64, 2048, 32, 384] + - [729, 9541.64] + - - [1001, 1536, 1, 32] + - [700, 3575.77] + - - [289, 1792, 1, 320] + - [723, 5140.43] + - - [3136, 256, 64, 64] + - [724, 9310.22] + - - [1001, 1024, 1, 32] + - [695, 2733.5] + - - [196, 480, 32, 64] + - [756, 5070.52] + - - [64, 1728, 1, 320] + - [753, 3205.67] + - - [49, 832, 32, 160] + - [757, 4988.92] + - - [49, 2048, 64, 512] + - [727, 7370.41] + - - [49, 832, 32, 384] + - [715, 5902.05] + - - [289, 896, 1, 192] + - [741, 3452.69] + - - [289, 1024, 32, 384] + - [760, 8902.52] + - - [784, 192, 32, 96] + - [771, 7853.73] + - - [50176, 256, 1, 128] + - [734, 9041.93] + - - [289, 1024, 32, 256] + - [769, 8660.82] + - - [289, 1024, 32, 192] + - [758, 8433.45] + - - [12544, 512, 1, 256] + - [718, 9187.44] + - - [1225, 1728, 1, 192] + - [722, 7720.95] + - - [196, 480, 32, 96] + - [767, 5662.6] + - - [196, 512, 32, 144] + - [761, 6531.48] + - - [784, 400, 1, 32] + - [696, 1280.1] + - - [289, 768, 32, 128] + - [762, 7913.71] + - - [5329, 576, 1, 96] + - [705, 7563.56] + - - [49, 1200, 1, 128] + - [749, 1011.71] + - - [64, 1536, 32, 256] + - [763, 9159.64] + - - [289, 2592, 1, 384] + - [731, 6002.81] + - - [196, 528, 32, 128] + - [766, 5987.2] + - - [64, 2048, 32, 448] + - [729, 9669.97] + - - [196, 1024, 64, 256] + - [768, 7819.04] + - - [5329, 448, 1, 64] + - [701, 6201.12] + - - [784, 256, 32, 64] + - [703, 7623.28] + - - [784, 192, 32, 32] + - [708, 5874.36] + - - [21609, 288, 1, 32] + - [721, 5296.6] + - - [784, 256, 32, 32] + - [699, 6235.56] + - - [5041, 720, 1, 192] + - [717, 8141.08] + - - [289, 2016, 1, 256] + - [714, 5404.15] + - - [196, 512, 32, 128] + - [759, 6366.92] + - - [289, 768, 32, 160] + - [761, 8253.98] + - - [64, 1536, 32, 384] + - [732, 9508.6] + - - [64, 1280, 32, 320] + - [732, 9070.83] + - - [289, 896, 1, 128] + - [742, 2917.78] + - - [289, 3456, 1, 384] + - [722, 7275.01] + - - [196, 800, 1, 64] + - [744, 1393.88] + - - [64, 1280, 32, 384] + - [728, 9225.11] + - - [64, 1344, 1, 512] + - [747, 3041.55] + - - [1001, 4096, 1, 512] + - [728, 9391.87] + - - [1225, 192, 32, 64] + - [701, 7729.39] + - - [64, 1152, 1, 384] + - [751, 2440.75] + - - [729, 1600, 1, 192] + - [713, 6827.81] + - - [289, 1344, 1, 192] + - [711, 4439.14] + - - [784, 192, 32, 16] + - [738, 3663.14] + - - [3136, 1024, 1, 2048] + - [720, 9071.87] + - - [64, 1152, 1, 448] + - [748, 2564.55] + - - [49, 832, 32, 128] + - [711, 4733.26] + - - [784, 256, 32, 128] + - [724, 8471.7] + - - [49, 800, 1, 128] + - [746, 633.635] + - - [196, 512, 32, 32] + - [708, 4354.36] + - - [1225, 384, 32, 96] + - [725, 8751.73] + - - [5041, 576, 1, 96] + - [707, 7067.73] + - - [49, 832, 32, 48] + - [740, 3316.82] + - - [3136, 64, 64, 256] + - [762, 9722.0] + - - [5329, 160, 32, 64] + - [764, 8159.94] + - - [1225, 288, 32, 48] + - [754, 6673.75] + - - [4096, 9216, 1, 512] + - [736, 10117.0] + - - [196, 480, 32, 192] + - [765, 6388.56] + - - [64, 1152, 1, 256] + - [752, 1982.7] + - - [3136, 1024, 1, 512] + - [720, 8745.67] + - - [49, 832, 32, 32] + - [739, 2717.97] + - - [784, 192, 32, 64] + - [703, 7216.42] + - - [289, 1024, 32, 128] + - [726, 7970.6] + - - [289, 768, 32, 192] + - [770, 8327.37] + - - [289, 1120, 1, 192] + - [710, 3717.0] + - - [196, 512, 32, 112] + - [716, 6252.91] + - - [1001, 2048, 1, 32] + - [704, 4000.19] + - - [1225, 288, 32, 64] + - [764, 7208.14] + - - [196, 600, 1, 64] + - [743, 1094.05] + - - [1225, 384, 32, 192] + - [725, 9332.76] + - - [50176, 256, 1, 512] + - [735, 9833.64] + - - [196, 512, 32, 160] + - [762, 6614.44] + - - [4096, 4096, 1, 512] + - [733, 10032.3] + - - [49, 832, 32, 192] + - [711, 5244.63] + - - [1225, 256, 32, 64] + - [701, 7972.45] + - - [64, 2048, 32, 320] + - [729, 9404.37] + - - [196, 480, 32, 16] + - [755, 2724.59] + - - [1225, 256, 32, 48] + - [703, 7100.48] + - - [64, 1280, 32, 448] + - [728, 9344.51] + - - [1225, 1200, 1, 64] + - [697, 5157.99] + - - [1225, 384, 32, 64] + - [701, 8220.06] + - - [12544, 512, 1, 1024] + - [720, 9672.82] + - - [64, 1280, 32, 192] + - [716, 8525.11] + - - [196, 512, 32, 64] + - [701, 5489.44] + - - [289, 1792, 1, 256] + - [719, 4831.71] + - - [196, 528, 32, 256] + - [737, 6453.92] + - - [49, 512, 64, 2048] + - [772, 7549.08] + - - [64, 2048, 32, 192] + - [724, 8955.91] + - - [784, 512, 64, 128] + - [724, 9160.83] + - - [784, 128, 64, 512] + - [731, 9280.79] + - - [196, 528, 32, 160] + - [765, 6161.25] + - - [1225, 192, 32, 48] + - [701, 7237.02] + - - [64, 1728, 1, 192] + - [751, 2480.67] + - - [1001, 2048, 1, 64] + - [777, 5714.52] + - - [5329, 64, 128, 80] + - [784, 8835.39] + - - [64, 1280, 128, 448] + - [782, 10020.6] + - - [289, 768, 128, 128] + - [785, 8542.81] + - - [1225, 192, 128, 64] + - [774, 8444.87] + - - [1225, 288, 128, 48] + - [787, 7244.76] + - - [289, 768, 128, 192] + - [789, 8794.59] + - - [289, 768, 128, 160] + - [786, 8705.43] + - - [64, 2048, 128, 192] + - [780, 9780.36] + - - [64, 1280, 128, 384] + - [783, 9951.0] + - - [1225, 256, 128, 48] + - [775, 8273.71] + - - [1225, 192, 128, 48] + - [775, 8140.42] + - - [1225, 288, 128, 64] + - [787, 7886.31] + - - [64, 1280, 128, 320] + - [779, 9894.66] + - - [1225, 256, 128, 64] + - [780, 8572.61] + - - [1001, 2048, 1, 128] + - [781, 7289.16] + - - [1225, 192, 128, 32] + - [776, 7104.67] + - - [64, 1280, 128, 192] + - [788, 9642.18] + - - [1001, 1536, 1, 64] + - [778, 5146.66] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bljk_SB.yaml index 34701116d..0596e8cd4 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Ailk_Bljk_SB.yaml @@ -32091,8 +32091,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32255,8 +32255,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32419,8 +32419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32583,8 +32583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32747,8 +32747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32911,8 +32911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33075,8 +33075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33239,8 +33239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33399,8 +33399,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33563,8 +33563,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33723,8 +33723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33887,8 +33887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34051,8 +34051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34215,8 +34215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34379,8 +34379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34543,8 +34543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34707,8 +34707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34871,8 +34871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35035,8 +35035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35199,8 +35199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35363,8 +35363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35527,8 +35527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35691,8 +35691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35855,8 +35855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36019,8 +36019,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36186,8 +36186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36349,8 +36349,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36516,8 +36516,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36679,8 +36679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36846,8 +36846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37009,8 +37009,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37176,8 +37176,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37339,8 +37339,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37506,8 +37506,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37667,8 +37667,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37828,8 +37828,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37991,8 +37991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38158,8 +38158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38323,8 +38323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38486,8 +38486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38653,8 +38653,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38816,8 +38816,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38983,8 +38983,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39146,8 +39146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39309,8 +39309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39474,8 +39474,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39637,8 +39637,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39800,8 +39800,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39965,8 +39965,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40128,8 +40128,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40291,8 +40291,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40452,8 +40452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40613,8 +40613,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40774,8 +40774,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40935,8 +40935,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41100,8 +41100,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41263,8 +41263,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41430,8 +41430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41593,8 +41593,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41756,8 +41756,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41915,8 +41915,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42078,8 +42078,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42239,8 +42239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42404,8 +42404,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42565,8 +42565,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42726,8 +42726,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42887,8 +42887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43052,8 +43052,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43213,8 +43213,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43374,8 +43374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43535,8 +43535,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43696,8 +43696,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43857,8 +43857,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44018,8 +44018,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44179,8 +44179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44340,8 +44340,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44501,8 +44501,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44662,8 +44662,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44823,8 +44823,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44984,8 +44984,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45145,8 +45145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45306,8 +45306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45467,8 +45467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45628,8 +45628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45787,8 +45787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45947,8 +45947,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46107,8 +46107,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46267,8 +46267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46427,8 +46427,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46587,8 +46587,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46747,8 +46747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46911,8 +46911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47071,8 +47071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47231,8 +47231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47391,8 +47391,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47551,8 +47551,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47711,8 +47711,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47871,8 +47871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48035,8 +48035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48195,8 +48195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48359,8 +48359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48519,8 +48519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48683,8 +48683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48843,8 +48843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49003,8 +49003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49163,8 +49163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49323,8 +49323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49483,8 +49483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49647,8 +49647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49811,8 +49811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49975,8 +49975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50135,8 +50135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50299,8 +50299,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50463,8 +50463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50623,8 +50623,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50787,8 +50787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50951,8 +50951,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51111,8 +51111,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51439,8 +51439,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51603,8 +51603,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51763,8 +51763,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51927,8 +51927,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52087,8 +52087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52251,8 +52251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52415,8 +52415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52579,8 +52579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52739,8 +52739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52903,8 +52903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53067,8 +53067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53231,8 +53231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53395,8 +53395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53559,8 +53559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53723,8 +53723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53887,8 +53887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54051,8 +54051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54215,8 +54215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54375,8 +54375,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54539,8 +54539,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54703,8 +54703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54867,8 +54867,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55031,8 +55031,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55195,8 +55195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55359,8 +55359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55519,8 +55519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55679,8 +55679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55839,8 +55839,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55999,8 +55999,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56159,8 +56159,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56319,8 +56319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56479,8 +56479,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56639,8 +56639,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56799,8 +56799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56959,8 +56959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57119,8 +57119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57279,8 +57279,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57443,8 +57443,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57607,8 +57607,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57767,8 +57767,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57931,8 +57931,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58095,8 +58095,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58259,8 +58259,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58419,8 +58419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58583,8 +58583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58743,8 +58743,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58907,8 +58907,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59071,8 +59071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59231,8 +59231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59395,8 +59395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59559,8 +59559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59723,8 +59723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59887,8 +59887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60051,8 +60051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60215,8 +60215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60379,8 +60379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60543,8 +60543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60707,8 +60707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60871,8 +60871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61035,8 +61035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61199,8 +61199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61363,8 +61363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61527,8 +61527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61691,8 +61691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61855,8 +61855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62015,8 +62015,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62179,8 +62179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62343,8 +62343,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62507,8 +62507,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62671,8 +62671,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62831,8 +62831,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62991,8 +62991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63155,8 +63155,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63319,8 +63319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63483,8 +63483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63647,8 +63647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63807,8 +63807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63971,8 +63971,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64135,8 +64135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64295,8 +64295,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64459,8 +64459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64619,8 +64619,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64783,8 +64783,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64943,8 +64943,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65103,8 +65103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65267,8 +65267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65431,8 +65431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65595,8 +65595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65759,8 +65759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65923,8 +65923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66087,8 +66087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66251,8 +66251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66411,8 +66411,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66575,8 +66575,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66739,8 +66739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66903,8 +66903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67067,8 +67067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67231,8 +67231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67395,8 +67395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67559,8 +67559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67723,8 +67723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67887,8 +67887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68051,8 +68051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68215,8 +68215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68379,8 +68379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68543,8 +68543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68707,8 +68707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68871,8 +68871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69035,8 +69035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69199,8 +69199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69359,8 +69359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69519,8 +69519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69683,8 +69683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69843,8 +69843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70003,8 +70003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70167,8 +70167,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70327,8 +70327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70491,8 +70491,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70651,8 +70651,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70811,8 +70811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70975,8 +70975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71139,8 +71139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71303,8 +71303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71467,8 +71467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71631,8 +71631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71795,8 +71795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71959,8 +71959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72123,8 +72123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72287,8 +72287,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72451,8 +72451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72611,8 +72611,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72775,8 +72775,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72939,8 +72939,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73103,8 +73103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73267,8 +73267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73431,8 +73431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73595,8 +73595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73759,8 +73759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73923,8 +73923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74087,8 +74087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74251,8 +74251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74415,8 +74415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74579,8 +74579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74739,8 +74739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74903,8 +74903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75067,8 +75067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75231,8 +75231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75395,8 +75395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75555,8 +75555,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75715,8 +75715,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75879,8 +75879,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76043,8 +76043,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76203,8 +76203,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76363,8 +76363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76523,8 +76523,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76683,8 +76683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76843,8 +76843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77003,8 +77003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77163,8 +77163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77327,8 +77327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77487,8 +77487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77647,8 +77647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77811,8 +77811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77975,8 +77975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78139,8 +78139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78303,8 +78303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78467,8 +78467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78631,8 +78631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78795,8 +78795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78959,8 +78959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79123,8 +79123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79292,8 +79292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79457,8 +79457,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79624,8 +79624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79791,8 +79791,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79958,8 +79958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80125,8 +80125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80294,8 +80294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80459,8 +80459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80628,8 +80628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80795,8 +80795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80962,8 +80962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81129,8 +81129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81296,8 +81296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81463,8 +81463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81630,8 +81630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81795,8 +81795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81962,8 +81962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82129,8 +82129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82296,8 +82296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82463,8 +82463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82630,8 +82630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82797,8 +82797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82966,8 +82966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83133,8 +83133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83300,8 +83300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83467,8 +83467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83634,8 +83634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83801,8 +83801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83968,8 +83968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84135,8 +84135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84300,8 +84300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84467,8 +84467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84632,8 +84632,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84799,8 +84799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84964,8 +84964,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85131,8 +85131,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85300,8 +85300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85467,8 +85467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85634,8 +85634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85801,8 +85801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85966,8 +85966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86133,8 +86133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86300,8 +86300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86469,8 +86469,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86636,8 +86636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86803,8 +86803,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86968,8 +86968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87135,8 +87135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87304,8 +87304,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87471,8 +87471,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87638,8 +87638,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87805,8 +87805,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87972,8 +87972,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -88139,8 +88139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -88186,23 +88186,23 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -88210,37 +88210,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88248,10 +88245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88259,26 +88256,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88288,6 +88293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88297,6 +88303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88311,47 +88318,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -88359,37 +88374,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88397,10 +88409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88408,26 +88420,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88437,6 +88457,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88446,6 +88467,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88460,33 +88482,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -88501,7 +88531,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88509,47 +88539,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88557,19 +88588,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -88577,6 +88615,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88586,6 +88625,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88595,6 +88635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88609,14 +88650,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -88627,63 +88675,69 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2560 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88692,9 +88746,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88702,26 +88756,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88731,6 +88793,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88740,6 +88803,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88754,48 +88818,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88803,43 +88875,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88847,26 +88924,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88876,6 +88959,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88885,6 +88969,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88899,48 +88984,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88948,43 +89043,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88992,26 +89092,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89021,6 +89127,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89030,6 +89137,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89044,14 +89152,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -89062,74 +89177,82 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89137,26 +89260,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89166,6 +89297,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89175,6 +89307,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89189,96 +89322,105 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89286,19 +89428,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89306,6 +89453,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89315,6 +89463,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89324,6 +89473,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89338,48 +89488,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89387,43 +89547,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89431,16 +89596,42405 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 784 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2080 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4224 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 520 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1040 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3136 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 800 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1296 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1312 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 16 + LSPA: 4 + LSPB: 12 + LVCA: 48 + LVCB: 16 + LVPA: 4 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 12, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 12 + LSCB: 16 + LSPA: 16 + LSPB: 12 + LVCA: 12 + LVCB: 16 + LVPA: 16 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 24 + LSCB: 32 + LSPA: 8 + LSPB: 6 + LVCA: 24 + LVCB: 32 + LVPA: 8 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 24 + MacroTile1: 24 + MacroTileA: 24 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: [3, 4] + ThreadTile0: 3 + ThreadTile1: 4 + ThreadTileA: 3 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsOffsetA: 0 + LdsOffsetB: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsOffsetA: 0 + LdsOffsetB: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6144 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6144 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 828 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 829 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 830 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 831 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 832 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 833 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6144 + LdsOffsetA: 0 + LdsOffsetB: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 @@ -89483,25 +132037,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 834 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89515,7 +132069,156 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 835 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89524,7 +132227,7 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89532,43 +132235,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 128 LVCA: 32 - LVCB: 32 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 24 MacroTile0: 128 - MacroTile1: 8 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 8 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89576,13 +132279,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -89628,14 +132331,308 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 + SolutionIndex: 836 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 837 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 838 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -89646,7 +132643,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89668,35 +132665,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -89706,18 +132703,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89725,14 +132722,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -89777,25 +132774,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SolutionIndex: 839 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89809,7 +132806,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89817,52 +132814,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89870,14 +132867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -89922,25 +132919,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 840 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89954,60 +132951,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 4 + LSPB: 32 LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 4 - MacroTileA: 64 - MacroTileB: 4 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90015,20 +133016,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90067,25 +133068,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 841 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90099,7 +133100,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90107,33 +133108,33 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 4 + LSPB: 32 LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 8192 LdsOffsetA: 0 LdsOffsetB: 4096 LdsPadA: 0 @@ -90141,18 +133142,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 4 - MacroTileA: 64 - MacroTileB: 4 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90160,14 +133161,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -90212,25 +133213,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SolutionIndex: 842 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90244,7 +133245,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90252,56 +133253,201 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 - LSCB: 16 - LSPA: 4 - LSPB: 12 - LVCA: 48 - LVCB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 843 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 12 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90309,20 +133455,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90361,26 +133507,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 + SolutionIndex: 844 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 12, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90393,7 +133539,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90401,39 +133547,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 12 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 12 - LVCA: 12 - LVCB: 16 - LVPA: 16 - LVPB: 12 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90446,11 +133592,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90458,15 +133604,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90510,26 +133656,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 - SubGroup0: 12 + SolutionIndex: 845 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90542,47 +133688,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90595,11 +133737,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90607,20 +133749,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90659,26 +133801,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SolutionIndex: 846 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90699,56 +133841,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90756,15 +133898,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90808,26 +133950,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + SolutionIndex: 847 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90846,58 +133988,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 24 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 6 - LVCA: 24 - LVCB: 32 - LVPA: 8 - LVPB: 6 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 24 - MacroTile1: 24 - MacroTileA: 24 - MacroTileB: 24 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90905,20 +134043,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 3 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90957,26 +134095,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 + SolutionIndex: 848 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [3, 4] - ThreadTile0: 3 - ThreadTile1: 4 - ThreadTileA: 3 - ThreadTileB: 4 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id010 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90997,56 +134135,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91054,15 +134192,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91106,26 +134244,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 - SubGroup0: 8 - SubGroup1: 6 - SubGroupA: 8 - SubGroupB: 6 - SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + SolutionIndex: 849 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -91138,7 +134276,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91146,7 +134284,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -91159,26 +134297,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 8 + LSCA: 16 + LSCB: 4 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91191,11 +134329,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91205,11 +134343,11 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 @@ -91255,14 +134393,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 850 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -91273,7 +134411,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -91287,7 +134425,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91295,39 +134433,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 2 + LSPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91340,11 +134478,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91352,8 +134490,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -91404,71 +134542,71 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SolutionIndex: 851 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 - LVCB: 2 - LVPA: 2 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsNumElementsAlignedA: 256 @@ -91482,18 +134620,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91501,15 +134639,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91553,79 +134691,79 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SolutionIndex: 852 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: *id044 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 4 LSPA: 4 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 2 - LVPA: 1 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91638,11 +134776,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91650,13 +134788,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 @@ -91702,35 +134840,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 + SolutionIndex: 853 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: *id042 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -91740,9 +134878,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -91751,47 +134889,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 8 LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91799,15 +134937,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91851,47 +134989,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SolutionIndex: 854 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: *id043 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -91900,30 +135038,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 4 LSPA: 4 LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91936,10 +135074,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91948,15 +135086,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92000,35 +135138,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 855 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 + ThreadTile: *id041 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: *id042 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -92038,10 +135176,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -92049,47 +135187,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 + KernelLanguage: Assembly + LSCA: 8 LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 8 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92097,8 +135235,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -92149,35 +135287,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 856 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -92187,10 +135325,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -92198,46 +135336,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 16 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92246,15 +135384,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92298,35 +135436,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SolutionIndex: 857 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 + ThreadTile: *id041 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -92336,58 +135474,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 8 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 8 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92395,8 +135533,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -92447,47 +135585,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 858 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -92496,26 +135634,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -92525,17 +135663,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92544,15 +135682,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92596,35 +135734,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 859 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id044 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -92634,9 +135772,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -92644,33 +135782,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 + LSPA: 8 + LSPB: 32 LVCA: 32 LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92682,10 +135820,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92693,15 +135831,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92745,47 +135883,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 860 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id046 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -92793,33 +135931,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 3360 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92830,11 +135968,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92842,15 +135980,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92894,35 +136032,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 + SolutionIndex: 861 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id046 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -92932,43 +136070,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92980,10 +136118,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92991,15 +136129,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93043,35 +136181,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 862 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -93081,43 +136219,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93129,10 +136267,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93140,15 +136278,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93192,35 +136330,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 863 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 + SuppresssNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -93231,42 +136369,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 LSCB: 16 LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 2 - LVPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93279,9 +136413,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93290,20 +136424,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93341,35 +136475,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SolutionIndex: 864 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 32 SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -93380,42 +136514,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 4 + LSPA: 2 LSPB: 16 - LVCA: 32 - LVCB: 8 + LVCA: 128 + LVCB: 16 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93427,10 +136557,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93438,21 +136568,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93490,95 +136620,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 865 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id047 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id048 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 256 LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 256 + LVCB: 16 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 256 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 256 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93587,21 +136713,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93639,35 +136765,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 866 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 + ThreadTile: *id050 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id048 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -93678,42 +136804,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93726,9 +136848,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93736,21 +136858,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93788,81 +136910,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 867 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id047 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 LSPA: 4 LSPB: 16 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 384 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93873,11 +136991,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93885,21 +137003,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 6 - NumLoadsB: 3 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93937,81 +137055,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 868 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id047 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -94022,11 +137136,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94034,21 +137148,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94086,26 +137200,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionIndex: 869 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -94118,64 +137232,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94183,21 +137293,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94235,31 +137345,30 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 870 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -94273,9 +137382,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -94283,27 +137391,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 2 LSPB: 32 - LVCA: 32 + LVCA: 128 LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -94313,18 +137421,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94332,14 +137440,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -94384,31 +137490,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 600 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 871 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SubGroupB: 16 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -94417,13 +137523,12 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -94432,48 +137537,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 256 LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPA: 1 + LSPB: 32 + LVCA: 256 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 1 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94481,20 +137582,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94533,31 +137632,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 601 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 872 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id053 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -94571,10 +137670,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94582,47 +137680,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94630,15 +137728,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -94682,31 +137778,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 602 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 873 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id019 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -94714,53 +137810,52 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94769,9 +137864,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94779,10 +137874,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -94831,31 +137924,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 603 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SolutionIndex: 874 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 SubGroup0: 32 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id025 + VectorWidth: 2 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -94863,16 +137956,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94880,47 +137972,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94928,15 +138020,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -94980,31 +138070,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 604 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 875 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id055 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95012,53 +138102,52 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95066,10 +138155,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95077,10 +138166,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -95129,31 +138216,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 605 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 876 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id055 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95167,58 +138254,53 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95226,20 +138308,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95278,31 +138358,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 606 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SolutionIndex: 877 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95316,8 +138396,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95327,7 +138406,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -95335,39 +138414,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95375,15 +138454,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95427,14 +138504,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 607 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SolutionIndex: 878 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -95445,13 +138521,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95465,8 +138542,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95476,7 +138552,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -95486,37 +138562,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95524,20 +138596,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95576,31 +138646,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 608 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 879 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95614,58 +138684,53 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95673,20 +138738,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95725,31 +138788,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 609 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 880 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95763,58 +138826,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95822,13 +138884,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -95874,31 +138934,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 610 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 881 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id051 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -95912,58 +138972,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95971,10 +139030,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -96023,31 +139080,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 611 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 882 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -96061,10 +139118,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96072,26 +139128,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -96101,18 +139157,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96120,10 +139176,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -96172,48 +139226,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 612 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SolutionIndex: 883 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 SubGroup0: 32 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id053 WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96221,7 +139276,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -96229,18 +139284,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96250,18 +139305,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96269,15 +139324,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96321,17 +139379,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 613 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 884 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -96339,15 +139404,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -96360,9 +139428,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96370,7 +139438,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -96381,15 +139449,15 @@ LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -96399,18 +139467,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96418,15 +139486,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96470,47 +139541,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 614 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 885 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -96518,37 +139599,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96556,10 +139637,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96567,15 +139648,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96619,33 +139703,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 615 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 886 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -96658,7 +139752,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -96668,47 +139762,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96716,15 +139810,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96768,33 +139865,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 616 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 887 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -96807,7 +139914,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -96817,47 +139924,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96865,15 +139972,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96917,48 +140027,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 617 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 888 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96966,47 +140086,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97014,15 +140134,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97066,96 +140189,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 618 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 889 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97163,15 +140296,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97215,33 +140351,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 619 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 890 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -97254,42 +140400,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97312,15 +140458,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97364,33 +140513,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 620 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -97403,57 +140562,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97461,15 +140620,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97513,33 +140675,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -97552,9 +140724,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97562,32 +140734,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97599,10 +140771,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97610,15 +140782,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97662,33 +140837,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -97701,9 +140886,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97711,7 +140896,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -97719,24 +140904,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97748,10 +140933,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97759,15 +140944,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97811,81 +140999,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -97896,7 +141094,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -97908,15 +141106,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -97960,46 +141161,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98009,32 +141220,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98045,10 +141256,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98059,13 +141270,16 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98109,14 +141323,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -98127,15 +141348,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98148,7 +141372,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98158,47 +141382,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98206,15 +141430,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98258,14 +141485,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 897 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -98276,15 +141510,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98297,57 +141534,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98355,15 +141592,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98407,33 +141647,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 898 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98446,57 +141696,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98504,15 +141754,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98556,33 +141809,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 899 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98595,9 +141858,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98605,7 +141868,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -98613,39 +141876,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98653,15 +141916,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98705,17 +141971,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 900 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -98723,28 +141996,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98754,32 +142030,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -98790,10 +142066,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98804,13 +142080,16 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98854,14 +142133,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 901 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -98872,15 +142158,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -98893,57 +142182,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98951,15 +142240,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99003,33 +142295,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 902 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -99042,57 +142344,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99100,15 +142402,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99152,46 +142457,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 903 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99201,32 +142516,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 64 LVCA: 16 - LVCB: 2 - LVPA: 2 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99237,7 +142552,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -99249,15 +142564,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99301,17 +142619,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 904 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -99319,30 +142644,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99350,32 +142678,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99386,11 +142714,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99398,15 +142726,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99450,33 +142781,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 905 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -99489,57 +142830,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99547,15 +142888,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99599,96 +142943,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 906 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99696,15 +143050,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99748,48 +143105,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 907 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99797,36 +143164,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -99834,9 +143201,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99845,15 +143212,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99897,33 +143267,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 908 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -99936,42 +143316,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99983,10 +143363,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99994,15 +143374,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100046,47 +143429,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 909 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -100094,37 +143487,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100132,10 +143525,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100143,15 +143536,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100172,6 +143568,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100181,6 +143578,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100195,48 +143593,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 910 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -100244,26 +143652,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -100273,7 +143681,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100281,10 +143689,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100292,15 +143700,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100321,6 +143732,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100330,6 +143742,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100344,48 +143757,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 911 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -100393,26 +143816,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6400 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -100422,7 +143845,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100430,9 +143853,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 256 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 256 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100441,15 +143864,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100470,6 +143896,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100479,6 +143906,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100493,47 +143921,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 912 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -100541,48 +143979,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100590,15 +144028,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100619,6 +144060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100628,6 +144070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100642,75 +144085,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -100720,7 +144173,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100728,10 +144181,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100739,15 +144192,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100768,6 +144224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100777,6 +144234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100791,48 +144249,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 914 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -100840,36 +144308,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100878,9 +144346,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100888,15 +144356,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -100917,6 +144388,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -100926,6 +144398,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -100940,17 +144413,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 915 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -100958,29 +144438,32 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 15 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -100988,23 +144471,23 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -101018,7 +144501,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101026,10 +144509,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101037,15 +144520,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -101066,6 +144552,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101075,6 +144562,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101089,48 +144577,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 916 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -101138,26 +144636,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -101167,7 +144665,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101175,10 +144673,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101186,15 +144684,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -101215,6 +144716,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101224,6 +144726,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101238,17 +144741,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 917 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -101256,30 +144766,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -101287,36 +144800,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101324,10 +144833,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101335,20 +144844,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101364,6 +144876,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101373,6 +144886,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101387,17 +144901,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -101405,30 +144926,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -101436,47 +144960,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101484,15 +145008,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -101513,6 +145040,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101522,6 +145050,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101536,48 +145065,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -101585,47 +145124,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 16 + MacroTileA: 256 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101633,20 +145168,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101662,6 +145200,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101671,6 +145210,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101685,75 +145225,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -101763,18 +145313,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101782,15 +145332,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -101811,6 +145364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101820,6 +145374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101834,85 +145389,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101921,9 +145486,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101931,15 +145496,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -101960,6 +145528,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -101969,6 +145538,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -101983,46 +145553,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102032,47 +145612,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102080,15 +145660,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102109,6 +145692,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102118,6 +145702,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102132,75 +145717,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -102210,7 +145805,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102218,10 +145813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102229,15 +145824,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102258,6 +145856,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102267,6 +145866,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102281,46 +145881,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102330,24 +145940,24 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -102359,7 +145969,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102367,10 +145977,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102378,15 +145988,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102407,6 +146020,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102416,6 +146030,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102430,46 +146045,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 11 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102479,47 +146104,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102527,15 +146152,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102556,6 +146184,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102565,6 +146194,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102579,17 +146209,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -102597,61 +146234,64 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102664,11 +146304,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102676,15 +146316,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102705,6 +146348,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102714,6 +146358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102728,75 +146373,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -102806,7 +146461,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102814,10 +146469,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102825,15 +146480,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102854,6 +146512,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -102863,6 +146522,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -102877,33 +146537,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 928 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -102916,57 +146586,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102974,15 +146644,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103003,6 +146676,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103012,6 +146686,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103026,35 +146701,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 929 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103064,10 +146749,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103080,21 +146765,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103109,9 +146798,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103119,20 +146808,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103148,6 +146840,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103157,6 +146850,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103171,35 +146865,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103209,8 +146913,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103225,9 +146929,9 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 @@ -103237,9 +146941,13 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103266,18 +146974,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103293,6 +147004,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103302,6 +147014,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103316,35 +147029,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103354,8 +147077,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103370,25 +147093,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103402,9 +147125,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -103413,15 +147136,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103442,6 +147168,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103451,6 +147178,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103465,35 +147193,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 932 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103503,8 +147241,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103519,25 +147257,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPB: 128 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103551,9 +147289,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -103564,13 +147302,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103591,6 +147332,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103600,6 +147342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103614,35 +147357,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103652,37 +147405,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103696,10 +147453,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103707,20 +147464,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103736,6 +147496,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103745,6 +147506,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103759,48 +147521,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 934 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103813,21 +147585,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103840,7 +147616,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -103854,24 +147630,30 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -103881,6 +147663,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -103890,6 +147673,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -103904,14 +147688,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 935 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -103922,30 +147713,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103958,25 +147750,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103989,7 +147781,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -104003,13 +147795,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104021,6 +147816,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104030,6 +147826,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104039,6 +147836,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104053,14 +147851,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 936 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -104071,30 +147876,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104107,21 +147915,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104134,10 +147946,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104146,8 +147958,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -104155,17 +147967,23 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104175,6 +147993,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104184,6 +148003,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104198,17 +148018,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 937 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 @@ -104216,30 +148043,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104252,21 +148080,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -104283,10 +148111,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104295,15 +148123,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104315,6 +148146,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104324,6 +148156,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104333,6 +148166,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104347,17 +148181,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 938 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 @@ -104365,17 +148206,20 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104385,8 +148229,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104401,25 +148245,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104433,9 +148277,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104444,15 +148288,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104464,6 +148313,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104473,6 +148323,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104482,6 +148333,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104496,17 +148348,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 939 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 @@ -104514,17 +148373,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104534,37 +148394,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104578,9 +148442,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104589,26 +148453,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104618,6 +148486,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104627,6 +148496,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104641,35 +148511,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 940 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104679,8 +148559,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104695,9 +148575,9 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 @@ -104707,9 +148587,9 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -104728,9 +148608,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104738,15 +148618,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -104758,6 +148643,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104767,6 +148653,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104776,6 +148663,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104790,46 +148678,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 941 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104844,21 +148740,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104871,11 +148771,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104883,26 +148783,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -104912,6 +148816,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -104921,6 +148826,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -104935,77 +148841,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105016,11 +148936,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105028,26 +148948,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105057,6 +148983,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105066,6 +148993,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105080,77 +149008,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 943 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105161,10 +149097,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -105173,15 +149109,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105193,6 +149134,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105202,6 +149144,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105211,6 +149154,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105225,77 +149169,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 944 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105306,7 +149258,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -105320,13 +149272,18 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105338,6 +149295,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105347,6 +149305,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105356,6 +149315,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105370,81 +149330,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 945 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105455,10 +149423,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -105467,15 +149435,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105487,6 +149458,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105496,6 +149468,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105505,6 +149478,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105519,39 +149493,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105559,41 +149543,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105604,11 +149588,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105616,26 +149600,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105645,6 +149635,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105654,6 +149645,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105668,39 +149660,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105708,41 +149708,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6688 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105753,11 +149753,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105765,26 +149765,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105794,6 +149800,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105803,6 +149810,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105817,33 +149825,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -105855,39 +149871,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105900,9 +149920,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105910,26 +149930,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -105939,6 +149963,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -105948,6 +149973,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -105962,39 +149988,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 949 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SuppressNoLoadLoop: true + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106003,7 +150039,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106016,27 +150052,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106047,10 +150083,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106059,8 +150095,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -106068,6 +150104,11 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -106079,6 +150120,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106088,6 +150130,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106097,6 +150140,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106111,48 +150155,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 950 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106165,23 +150217,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106192,11 +150248,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106206,24 +150262,28 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106233,6 +150293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106242,6 +150303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106256,39 +150318,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 951 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106297,7 +150369,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106310,27 +150382,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106341,11 +150413,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106356,12 +150428,17 @@ NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -106373,6 +150450,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106382,6 +150460,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106391,6 +150470,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106405,47 +150485,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 952 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -106453,33 +150541,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106490,11 +150578,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106502,15 +150590,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -106522,6 +150613,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106531,6 +150623,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106540,6 +150633,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106554,33 +150648,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -106608,23 +150712,23 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106636,9 +150740,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106647,15 +150751,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -106667,6 +150776,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106676,6 +150786,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106685,6 +150796,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106699,33 +150811,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -106753,7 +150873,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -106765,15 +150885,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106805,6 +150925,11 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -106816,6 +150941,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106825,6 +150951,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106834,6 +150961,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106848,33 +150976,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -106887,42 +151023,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106935,9 +151071,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106945,15 +151081,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -106965,6 +151104,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -106974,6 +151114,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -106983,6 +151124,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -106997,33 +151139,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 956 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107051,23 +151203,23 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -107079,10 +151231,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107090,15 +151242,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107110,6 +151267,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107119,6 +151277,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -107128,6 +151287,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -107142,33 +151302,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 957 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107196,7 +151364,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -107208,15 +151376,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -107229,9 +151397,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107239,15 +151407,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107259,6 +151432,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107268,6 +151442,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -107277,6 +151452,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -107291,33 +151467,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 958 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107329,8 +151513,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107345,23 +151529,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -107373,9 +151561,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107384,26 +151572,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107413,6 +151605,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -107422,6 +151615,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -107436,79 +151630,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 959 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107521,11 +151721,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107534,25 +151734,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107562,6 +151768,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -107571,6 +151778,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -107585,39 +151793,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 960 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107625,35 +151841,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107666,11 +151882,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107678,26 +151894,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107707,6 +151929,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -107716,6 +151939,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -107730,79 +151954,83 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 961 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107815,11 +152043,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107827,26 +152055,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -107856,6 +152090,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -107865,6 +152100,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -107879,33 +152115,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 962 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107917,41 +152161,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107965,10 +152205,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107976,26 +152216,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108005,6 +152251,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108014,6 +152261,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108028,33 +152276,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 963 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108068,35 +152324,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108110,10 +152366,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108121,15 +152377,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108141,6 +152402,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108150,6 +152412,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108159,6 +152422,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108173,39 +152437,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 964 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108214,7 +152486,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108227,25 +152499,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108258,11 +152530,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108270,15 +152542,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108290,6 +152567,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108299,6 +152577,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108308,6 +152587,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108322,17 +152602,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 965 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -108340,29 +152627,30 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -108370,31 +152658,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 64 - LVCB: 8 + LVCA: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108407,11 +152695,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108419,15 +152707,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108439,6 +152730,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108448,6 +152740,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108457,6 +152750,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108471,45 +152765,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 966 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -108525,21 +152829,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 32 - LVCB: 4 + LVCA: 16 + LVCB: 2 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108552,11 +152860,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108564,26 +152872,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108593,6 +152907,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108602,6 +152917,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108616,17 +152932,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -108634,30 +152957,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108670,25 +152994,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108701,11 +153025,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108713,15 +153037,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108733,6 +153060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108742,6 +153070,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108751,6 +153080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108765,39 +153095,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108819,21 +153159,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPB: 32 + LVCA: 16 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108846,11 +153186,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108858,15 +153198,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108878,6 +153223,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -108887,6 +153233,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -108896,6 +153243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -108910,46 +153258,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 969 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108964,25 +153320,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPB: 32 + LVCA: 16 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108995,11 +153347,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109007,26 +153359,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109036,6 +153392,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109045,6 +153402,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109059,39 +153417,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 970 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109099,37 +153467,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109140,11 +153508,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109152,26 +153520,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109181,6 +153555,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109190,6 +153565,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109204,81 +153580,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 971 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109289,11 +153669,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109301,26 +153681,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109330,6 +153716,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109339,6 +153726,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109353,77 +153741,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 972 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109434,7 +153834,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -109447,7 +153847,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -109455,17 +153855,23 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109475,6 +153881,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109484,6 +153891,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109498,81 +153906,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 973 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109583,11 +153995,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109596,25 +154008,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109624,6 +154042,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109633,6 +154052,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109647,39 +154067,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 974 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109687,37 +154115,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109728,11 +154156,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109740,26 +154168,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109769,6 +154203,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109778,6 +154213,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109792,81 +154228,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 975 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109877,11 +154317,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109889,26 +154329,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -109918,6 +154364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -109927,6 +154374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -109941,77 +154389,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 976 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110022,11 +154482,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110034,26 +154494,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110063,6 +154529,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110072,6 +154539,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110086,81 +154554,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110171,11 +154643,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110183,26 +154655,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110212,6 +154690,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110221,6 +154700,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110235,39 +154715,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 978 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110275,37 +154763,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110316,11 +154804,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110328,15 +154816,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110348,6 +154841,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110357,6 +154851,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110366,6 +154861,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110380,81 +154876,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 979 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110465,7 +154965,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -110478,25 +154978,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110506,6 +155012,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110515,6 +155022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110529,39 +155037,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 980 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110569,37 +155085,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110610,11 +155126,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110622,7 +155138,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 @@ -110631,17 +155147,23 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110651,6 +155173,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110660,6 +155183,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110674,81 +155198,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 981 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110759,11 +155287,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110771,7 +155299,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 @@ -110780,17 +155308,23 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110800,6 +155334,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110809,6 +155344,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110823,39 +155359,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 982 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110863,37 +155407,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110904,11 +155448,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110916,15 +155460,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110936,6 +155485,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -110945,6 +155495,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -110954,6 +155505,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -110968,81 +155520,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111053,11 +155609,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111065,26 +155621,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111094,6 +155656,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111103,6 +155666,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111117,46 +155681,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -111165,33 +155737,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111202,10 +155770,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111214,26 +155782,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111243,6 +155817,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111252,6 +155827,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111266,46 +155842,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 985 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id040 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -111314,33 +155898,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 2 - LSPA: 2 - LSPB: 32 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111351,11 +155931,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111364,25 +155944,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111392,6 +155978,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111401,6 +155988,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111415,33 +156003,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 986 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -111453,7 +156049,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -111463,7 +156059,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -111472,39 +156068,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 - LVPA: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111512,26 +156104,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111541,6 +156139,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111550,6 +156149,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111564,45 +156164,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id041 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 987 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -111612,33 +156220,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111649,10 +156253,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111661,26 +156265,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111690,6 +156300,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111699,6 +156310,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111713,45 +156325,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 988 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -111761,8 +156381,8 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -111770,39 +156390,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3104 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111810,26 +156426,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111839,6 +156461,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111848,6 +156471,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111862,45 +156486,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 989 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -111910,33 +156542,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 + LSCA: 64 + LSCB: 8 + LSPA: 2 LSPB: 16 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -111947,11 +156575,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111959,26 +156587,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -111988,6 +156622,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111997,6 +156632,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112011,33 +156647,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 990 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112049,7 +156693,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -112059,48 +156703,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112108,26 +156748,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -112137,6 +156783,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112146,6 +156793,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112160,45 +156808,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 991 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -112208,48 +156864,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112257,26 +156909,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -112286,6 +156944,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112295,6 +156954,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112309,33 +156969,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 992 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112347,58 +157015,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112406,26 +157070,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -112435,6 +157105,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112444,6 +157115,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112458,96 +157130,100 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 993 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112555,26 +157231,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -112584,6 +157264,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112593,6 +157274,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112607,33 +157289,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112645,43 +157337,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112693,9 +157381,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112704,20 +157392,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -112733,6 +157424,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112742,6 +157434,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112756,33 +157449,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 995 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id046 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112794,58 +157497,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3360 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112853,21 +157552,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -112882,6 +157584,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112891,6 +157594,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112905,33 +157609,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112943,43 +157657,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112991,10 +157701,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113002,21 +157712,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -113031,6 +157744,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113040,6 +157754,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113054,33 +157769,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 997 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id045 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113092,43 +157817,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113140,10 +157861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113151,20 +157872,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -113180,6 +157904,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113189,6 +157914,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113203,33 +157929,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 998 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id046 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113242,53 +157978,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113296,15 +158032,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -113325,6 +158064,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113334,6 +158074,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113348,33 +158089,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 999 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113387,53 +158138,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 2 + LSPA: 16 LSPB: 16 - LVCA: 128 - LVCB: 16 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113441,21 +158192,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113470,6 +158224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113479,6 +158234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113493,92 +158249,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 16 LSCB: 16 - LSPA: 1 + LSPA: 16 LSPB: 16 - LVCA: 256 - LVCB: 16 - LVPA: 1 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4640 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113586,21 +158356,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113615,6 +158388,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113624,6 +158398,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113638,33 +158413,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113677,52 +158462,52 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -113731,21 +158516,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113760,6 +158548,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113769,6 +158558,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113783,33 +158573,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113822,38 +158622,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113865,10 +158665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113876,15 +158676,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -113905,6 +158708,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113914,6 +158718,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113928,33 +158733,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id049 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113967,38 +158782,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114010,9 +158825,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114021,15 +158836,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114050,6 +158868,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114059,6 +158878,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114073,33 +158893,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id049 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114112,38 +158942,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114155,9 +158985,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114166,21 +158996,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -114195,6 +159028,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114204,6 +159038,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114218,94 +159053,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id049 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114313,19 +159156,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -114340,6 +159188,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114349,6 +159198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114363,91 +159213,102 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id052 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114455,19 +159316,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -114482,6 +159348,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114491,6 +159358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114505,44 +159373,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - ThreadTile: *id051 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -114558,17 +159437,17 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -114582,7 +159461,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114590,10 +159469,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114601,13 +159480,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114628,6 +159512,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114637,6 +159522,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114651,44 +159537,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id052 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -114702,33 +159599,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114736,10 +159629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114747,18 +159640,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114774,6 +159672,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114783,6 +159682,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114797,74 +159697,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id053 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -114874,7 +159785,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114882,10 +159793,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114893,13 +159804,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114920,6 +159836,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114929,6 +159846,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114943,84 +159861,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id055 + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -115028,10 +159953,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115039,18 +159964,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115066,6 +159996,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115075,6 +160006,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115089,91 +160021,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id055 + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115181,18 +160128,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115208,6 +160160,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115217,6 +160170,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115231,33 +160185,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id051 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115269,42 +160233,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1056 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115316,10 +160277,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115327,18 +160288,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115354,6 +160320,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115363,6 +160330,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115377,33 +160345,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id052 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115415,38 +160393,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115458,10 +160437,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115469,19 +160448,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115496,6 +160480,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115505,6 +160490,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115519,33 +160505,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id052 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115557,38 +160553,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115600,9 +160597,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115611,13 +160608,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115638,6 +160640,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115647,6 +160650,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115661,33 +160665,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id052 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115699,57 +160713,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 544 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115757,18 +160768,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115784,6 +160800,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115793,6 +160810,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115807,33 +160825,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id052 + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115845,57 +160873,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115903,18 +160928,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115930,6 +160960,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115939,6 +160970,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115953,33 +160985,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - ThreadTile: *id054 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115991,57 +161033,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116049,13 +161092,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -116076,6 +161124,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116085,6 +161134,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116099,26 +161149,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - ThreadTile: *id054 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -116132,7 +161191,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116140,7 +161199,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -116148,33 +161207,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116185,11 +161244,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116197,12 +161256,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116229,6 +161288,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116238,6 +161298,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116256,8 +161317,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116266,21 +161327,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -116294,7 +161355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116302,56 +161363,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116359,12 +161420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116376,7 +161437,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -116391,6 +161452,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116400,6 +161462,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116418,31 +161481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -116456,15 +161519,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -116472,37 +161535,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -116510,10 +161569,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116521,8 +161580,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -116537,7 +161596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -116553,6 +161612,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116562,6 +161622,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116580,31 +161641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -116618,7 +161679,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116626,56 +161687,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116683,12 +161744,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116715,6 +161776,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116724,6 +161786,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116742,31 +161805,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -116780,7 +161843,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116788,56 +161851,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116845,12 +161908,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116862,7 +161925,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -116877,6 +161940,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116886,6 +161950,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116904,31 +161969,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -116948,7 +162013,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -116967,39 +162032,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117007,12 +162068,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -117023,7 +162084,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117039,6 +162100,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117048,6 +162110,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117066,29 +162129,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -117104,7 +162167,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117112,7 +162175,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -117120,25 +162183,25 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -117146,11 +162209,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117158,10 +162221,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117169,11 +162232,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -117201,6 +162264,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117210,6 +162274,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117228,31 +162293,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117266,7 +162331,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117274,45 +162339,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 LVCA: 8 - LVCB: 4 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 3648 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117320,10 +162385,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117331,13 +162396,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117348,7 +162413,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -117363,6 +162428,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117372,6 +162438,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117390,31 +162457,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117428,64 +162495,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117493,12 +162556,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -117509,7 +162572,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117525,6 +162588,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117534,6 +162598,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117552,31 +162617,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117590,7 +162655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117598,7 +162663,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -117606,37 +162671,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117644,9 +162709,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -117655,12 +162720,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -117687,6 +162752,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117696,6 +162762,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117714,31 +162781,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117752,7 +162819,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117760,45 +162827,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117806,10 +162873,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117817,13 +162884,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117849,6 +162916,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117858,6 +162926,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117876,31 +162945,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117922,56 +162991,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 640 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117980,12 +163049,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118011,6 +163080,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118020,6 +163090,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118038,29 +163109,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -118082,58 +163153,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118141,11 +163208,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118157,7 +163224,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118173,6 +163240,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118182,6 +163250,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118200,29 +163269,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -118238,7 +163307,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118246,56 +163315,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118303,11 +163372,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118335,6 +163404,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118344,6 +163414,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118362,31 +163433,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118400,15 +163471,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -118416,37 +163487,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118454,10 +163521,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118465,11 +163532,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118481,7 +163548,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -118497,6 +163564,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118506,6 +163574,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118524,31 +163593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118562,7 +163631,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118570,45 +163639,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118616,10 +163685,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118627,11 +163696,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118659,6 +163728,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118668,6 +163738,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118686,31 +163757,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118724,7 +163795,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118732,7 +163803,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -118740,37 +163811,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118778,9 +163849,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118789,12 +163860,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -118806,7 +163877,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -118821,6 +163892,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118830,6 +163902,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118848,31 +163921,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118894,55 +163967,55 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118951,12 +164024,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -118983,6 +164056,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118992,6 +164066,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119010,28 +164085,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -119048,64 +164123,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 832 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119113,13 +164184,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119129,7 +164200,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -119145,6 +164216,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119154,6 +164226,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119172,31 +164245,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -119210,7 +164283,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119218,7 +164291,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -119226,37 +164299,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119264,10 +164337,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119275,13 +164348,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119307,6 +164380,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119316,6 +164390,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119334,31 +164409,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1038 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -119372,7 +164447,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119380,41 +164455,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119425,11 +164500,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119437,13 +164512,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119469,6 +164544,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119478,6 +164554,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119496,31 +164573,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1039 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -119534,7 +164611,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119542,7 +164619,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -119550,37 +164627,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119588,10 +164665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119599,11 +164676,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -119616,7 +164693,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -119631,6 +164708,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119640,6 +164718,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119658,31 +164737,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1040 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -119704,56 +164783,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119761,11 +164840,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -119793,6 +164872,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119802,6 +164882,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119820,29 +164901,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1041 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -119858,7 +164939,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119866,56 +164947,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119923,12 +165004,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -119955,6 +165036,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119964,6 +165046,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119982,31 +165065,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + SolutionIndex: 1042 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120020,7 +165103,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -120028,7 +165111,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -120036,37 +165119,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120074,10 +165157,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120085,13 +165168,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120102,7 +165185,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -120117,6 +165200,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120126,6 +165210,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120144,31 +165229,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 1043 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120190,56 +165275,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120247,13 +165332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120279,6 +165364,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120288,6 +165374,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120306,29 +165393,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 1044 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -120344,7 +165431,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -120352,7 +165439,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -120360,37 +165447,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120398,10 +165485,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120409,13 +165496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120451,8 +165538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120470,31 +165557,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + SolutionIndex: 1045 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120508,7 +165595,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -120516,7 +165603,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -120524,37 +165611,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120562,10 +165649,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120573,8 +165660,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120590,7 +165677,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -120615,8 +165702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120634,31 +165721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + SolutionIndex: 1046 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120672,15 +165759,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -120688,37 +165775,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120726,10 +165809,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120737,11 +165820,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -120753,7 +165836,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -120779,8 +165862,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120798,31 +165881,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + SolutionIndex: 1047 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120836,7 +165919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -120844,7 +165927,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -120852,37 +165935,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120890,10 +165973,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120901,8 +165984,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120943,8 +166026,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120962,31 +166045,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + SolutionIndex: 1048 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121000,7 +166083,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121008,7 +166091,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121016,37 +166099,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121054,10 +166137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121065,11 +166148,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -121082,7 +166165,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -121107,8 +166190,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121126,31 +166209,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + SolutionIndex: 1049 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 7 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121164,7 +166247,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121172,7 +166255,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121180,37 +166263,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121218,10 +166301,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121229,11 +166312,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -121271,8 +166354,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121290,31 +166373,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + SolutionIndex: 1050 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 15 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121328,7 +166411,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121336,7 +166419,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121344,37 +166427,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121382,10 +166465,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121393,12 +166476,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -121435,8 +166518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121454,31 +166537,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + SolutionIndex: 1051 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121492,7 +166575,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121500,7 +166583,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121508,37 +166591,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121546,10 +166629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121557,13 +166640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121574,7 +166657,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -121599,8 +166682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121618,31 +166701,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + SolutionIndex: 1052 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 17 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121656,15 +166739,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121672,33 +166755,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -121706,10 +166793,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121717,13 +166804,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121733,7 +166820,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -121759,8 +166846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121778,31 +166865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + SolutionIndex: 1053 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 + SubGroup0: 8 SubGroup1: 4 - SubGroupA: 64 + SubGroupA: 8 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 17 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121816,15 +166903,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121832,31 +166919,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121869,11 +166952,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121881,13 +166964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121897,8 +166980,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -121923,8 +167006,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121942,31 +167025,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + SolutionIndex: 1054 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -121980,7 +167063,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -121988,7 +167071,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -121996,27 +167079,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 4 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 4 - LVPA: 1 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122029,11 +167112,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122041,13 +167124,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122083,8 +167166,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122102,31 +167185,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + SolutionIndex: 1055 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -122146,58 +167229,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122205,8 +167284,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122221,8 +167300,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -122247,8 +167326,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122266,29 +167345,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + SolutionIndex: 1056 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -122310,58 +167389,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122369,8 +167444,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122385,7 +167460,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -122411,8 +167486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122430,29 +167505,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + SolutionIndex: 1057 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 2 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -122474,58 +167549,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 4 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 64 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122533,8 +167604,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122549,8 +167620,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -122575,8 +167646,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122594,29 +167665,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + SolutionIndex: 1058 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 7 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -122632,63 +167703,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 4 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 2 + MacroTile0: 8 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122697,8 +167764,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -122713,7 +167780,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -122739,8 +167806,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122758,31 +167825,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + SolutionIndex: 1059 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 7 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -122802,58 +167869,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122861,13 +167924,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122877,8 +167940,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -122903,8 +167966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122922,29 +167985,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + SolutionIndex: 1060 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 11 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -122966,58 +168029,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123025,8 +168084,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -123041,8 +168100,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -123067,8 +168126,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123086,29 +168145,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + SolutionIndex: 1061 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -123130,58 +168189,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123189,8 +168244,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -123205,7 +168260,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -123231,8 +168286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123250,29 +168305,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 1062 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -123294,9 +168349,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -123304,48 +168359,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 4 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123353,12 +168404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123369,8 +168420,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -123395,8 +168446,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123414,29 +168465,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + SolutionIndex: 1063 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -123452,63 +168503,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 4 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123517,11 +168564,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -123533,8 +168580,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -123559,8 +168606,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123578,31 +168625,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 1064 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -123616,15 +168663,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -123632,47 +168679,43 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123681,12 +168724,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123697,7 +168740,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -123723,8 +168766,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123742,31 +168785,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + SolutionIndex: 1065 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -123788,55 +168831,55 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 4 LSCB: 16 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123845,12 +168888,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123887,8 +168930,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123906,28 +168949,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + SolutionIndex: 1066 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123952,55 +168995,55 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 4 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124009,12 +169052,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -124051,8 +169094,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124070,29 +169113,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + SolutionIndex: 1067 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -124108,64 +169151,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 16 - LSPB: 128 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1344 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124173,13 +169212,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -124189,7 +169228,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -124215,8 +169254,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124234,31 +169273,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + SolutionIndex: 1068 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 32 - SubGroupA: 16 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 32, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124272,7 +169311,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124280,56 +169319,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124337,12 +169376,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -124379,8 +169418,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124398,31 +169437,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + SolutionIndex: 1069 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124436,15 +169475,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124452,48 +169491,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124501,10 +169540,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124520,13 +169557,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124546,8 +169582,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124565,31 +169601,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1070 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124601,7 +169639,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124609,7 +169647,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124617,48 +169655,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124666,8 +169704,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124689,7 +169727,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124709,8 +169746,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124728,31 +169765,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1071 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124766,15 +169803,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124782,48 +169819,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124831,14 +169864,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -124849,14 +169880,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124876,8 +169906,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124895,31 +169925,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1072 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124931,7 +169963,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124939,7 +169971,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124947,48 +169979,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124996,12 +170028,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125019,7 +170051,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125039,8 +170070,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125058,31 +170089,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1073 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125096,64 +170127,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 4 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2240 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125161,14 +170188,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125179,14 +170204,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125206,8 +170230,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125225,31 +170249,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1074 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125261,7 +170287,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -125269,56 +170295,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125326,11 +170352,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -125349,7 +170375,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125369,8 +170394,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125388,31 +170413,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1075 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125426,64 +170451,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125491,14 +170516,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125516,7 +170539,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125536,8 +170558,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125555,31 +170577,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1076 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125591,64 +170615,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125656,11 +170676,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -125672,14 +170692,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125699,8 +170718,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125718,31 +170737,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1077 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125756,64 +170775,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125821,11 +170840,9 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -125846,7 +170863,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125866,8 +170882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125885,31 +170901,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1078 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125921,60 +170939,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 4 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125982,14 +171004,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126000,14 +171020,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126027,8 +171046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126046,31 +171065,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1079 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 2 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 2 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [2, 16, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126082,60 +171103,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126143,14 +171168,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126161,14 +171184,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126188,8 +171210,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126207,31 +171229,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1080 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126243,7 +171267,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126263,33 +171287,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126297,10 +171321,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126308,8 +171332,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -126331,7 +171355,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126351,8 +171374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126370,32 +171393,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1081 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -126408,14 +171431,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -126434,38 +171457,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126473,14 +171496,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126492,13 +171513,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126518,8 +171538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126537,31 +171557,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1082 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126573,14 +171595,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -126598,28 +171620,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126627,10 +171649,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126638,14 +171660,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126657,13 +171677,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126683,8 +171702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126702,31 +171721,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1083 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126738,7 +171759,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126763,18 +171784,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -126784,7 +171805,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126792,10 +171813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126803,12 +171824,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126820,13 +171841,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126846,8 +171866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126865,31 +171885,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1084 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -126903,15 +171923,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -126919,48 +171939,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126968,13 +171988,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -126993,7 +172011,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127013,8 +172030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127032,31 +172049,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1085 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127068,7 +172087,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -127076,7 +172095,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127084,48 +172103,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127133,8 +172152,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -127156,7 +172175,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127176,8 +172194,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127195,31 +172213,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1086 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127233,15 +172251,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127249,37 +172267,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127287,10 +172305,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127298,13 +172316,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -127323,7 +172339,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127343,8 +172358,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127362,31 +172377,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1087 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127398,7 +172415,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -127406,7 +172423,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127414,25 +172431,25 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -127440,11 +172457,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127452,10 +172469,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127463,11 +172480,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -127486,7 +172503,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127506,8 +172522,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127525,31 +172541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1088 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127563,49 +172579,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127614,9 +172634,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127624,13 +172644,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -127642,14 +172660,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127669,8 +172686,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127688,31 +172705,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1089 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127724,64 +172743,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127789,13 +172808,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -127814,7 +172831,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127834,8 +172850,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127853,31 +172869,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1090 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127889,7 +172907,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -127897,30 +172915,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7232 @@ -127931,11 +172949,11 @@ LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127943,10 +172961,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127954,13 +172972,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127977,7 +172995,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127997,8 +173014,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128016,31 +173033,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1091 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128054,60 +173071,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128115,13 +173136,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -128133,14 +173152,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128160,8 +173178,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128179,31 +173197,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1092 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128215,64 +173235,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128280,13 +173300,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -128299,13 +173317,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128325,8 +173342,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128344,31 +173361,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1093 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128380,64 +173399,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128445,11 +173460,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -128461,14 +173476,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128488,8 +173502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128507,31 +173521,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1094 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128545,60 +173559,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128606,15 +173624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128624,14 +173640,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128651,8 +173666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128670,31 +173685,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1095 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128706,59 +173723,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -128767,15 +173788,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128785,14 +173804,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128812,8 +173830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128831,31 +173849,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1096 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128873,37 +173893,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128918,9 +173942,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128928,15 +173952,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128946,14 +173968,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128973,8 +173994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128992,31 +174013,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1097 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129028,43 +174051,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129077,11 +174104,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129089,15 +174116,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129107,14 +174132,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129134,8 +174158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129153,31 +174177,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1098 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129189,43 +174215,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129238,11 +174264,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129250,15 +174276,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129269,13 +174293,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129295,8 +174318,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129314,31 +174337,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1099 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129350,15 +174375,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129366,31 +174391,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129403,11 +174424,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129416,14 +174437,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129433,14 +174452,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129460,8 +174478,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129479,16 +174497,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -129499,11 +174517,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129515,7 +174535,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129524,7 +174544,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -129540,22 +174560,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129568,11 +174588,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129580,13 +174600,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129603,7 +174623,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129623,8 +174642,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129642,20 +174661,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -129663,10 +174682,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -129680,14 +174699,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -129706,21 +174725,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 64 LVCA: 16 - LVCB: 2 - LVPA: 2 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129733,7 +174752,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -129745,15 +174764,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129770,7 +174787,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129790,8 +174806,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129809,31 +174825,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129845,7 +174863,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129854,7 +174872,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -129871,21 +174889,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129898,11 +174916,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129914,9 +174932,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129933,7 +174951,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129953,8 +174970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129972,15 +174989,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -129993,10 +175010,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130010,45 +175027,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130059,11 +175080,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130072,14 +175093,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130089,14 +175108,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130116,8 +175134,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130135,16 +175153,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -130155,11 +175173,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130171,7 +175191,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -130179,37 +175199,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130220,11 +175240,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130233,12 +175253,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130255,7 +175275,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130275,8 +175294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130294,15 +175313,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -130314,11 +175333,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130338,39 +175357,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130394,13 +175417,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130411,14 +175432,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130438,8 +175458,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130457,8 +175477,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130466,22 +175486,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130493,60 +175515,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130554,15 +175580,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130572,14 +175596,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130599,8 +175622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130618,31 +175641,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130654,49 +175679,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 16 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3080 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130707,7 +175728,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -130720,9 +175741,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -130737,14 +175756,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130764,8 +175782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130783,8 +175801,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130792,22 +175810,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130819,56 +175839,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -130880,15 +175904,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130898,14 +175920,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130925,8 +175946,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130944,8 +175965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130953,22 +175974,24 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130980,60 +176003,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 8192 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131041,15 +176064,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131066,7 +176087,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131086,8 +176106,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131105,31 +176125,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131141,60 +176163,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131202,15 +176228,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131220,14 +176244,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131247,8 +176270,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131266,31 +176289,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131302,49 +176327,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 16 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3080 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131355,7 +176376,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -131368,9 +176389,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -131385,14 +176404,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131412,8 +176430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131431,8 +176449,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131440,22 +176458,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131467,45 +176487,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -131516,11 +176536,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131529,14 +176549,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131553,7 +176571,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131573,8 +176590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131592,15 +176609,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -131608,15 +176625,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131628,49 +176647,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131679,9 +176702,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131689,15 +176712,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131707,14 +176728,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131734,8 +176754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131753,31 +176773,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131789,60 +176811,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131850,15 +176876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131868,14 +176892,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131895,8 +176918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131914,31 +176937,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131950,49 +176975,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 LVCA: 64 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -132000,10 +177029,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132012,14 +177041,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132029,14 +177056,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132056,8 +177082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132075,31 +177101,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132111,60 +177139,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132172,15 +177204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132190,14 +177220,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132217,8 +177246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132236,8 +177265,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132245,22 +177274,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132272,56 +177303,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -132333,15 +177368,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132351,14 +177384,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132378,8 +177410,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132397,8 +177429,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132406,22 +177438,24 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132439,39 +177473,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132484,9 +177522,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132494,15 +177532,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132512,14 +177548,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132539,8 +177574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132558,31 +177593,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132600,39 +177637,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132644,10 +177685,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132655,15 +177696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132673,14 +177712,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132700,8 +177738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132719,31 +177757,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132762,38 +177802,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 2112 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132817,13 +177857,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132835,13 +177873,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132861,8 +177898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132880,8 +177917,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132896,15 +177933,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132922,39 +177961,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -132967,9 +178010,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132977,14 +178020,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132995,14 +178036,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133022,8 +178062,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133041,8 +178081,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133050,22 +178090,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133083,39 +178125,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -133128,9 +178174,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133138,15 +178184,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133156,14 +178200,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133183,8 +178226,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133202,31 +178245,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133238,45 +178283,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -133287,11 +178336,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133299,14 +178348,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -133317,14 +178364,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133344,8 +178390,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133363,8 +178409,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133372,22 +178418,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133399,43 +178447,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -133448,11 +178500,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133462,13 +178514,11 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133478,14 +178528,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133505,8 +178554,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133524,31 +178573,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133560,49 +178611,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -133621,15 +178676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133639,14 +178692,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133666,8 +178718,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133685,8 +178737,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133694,22 +178746,24 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133721,60 +178775,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133782,15 +178840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133800,14 +178856,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133827,8 +178882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133846,8 +178901,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133855,22 +178910,24 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133882,14 +178939,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -133907,24 +178964,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCA: 128 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -133932,9 +178993,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133943,15 +179004,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133961,14 +179020,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133988,8 +179046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134007,8 +179065,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -134016,11 +179074,11 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -134028,10 +179086,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -134043,16 +179103,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -134069,23 +179129,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -134094,9 +179158,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134104,13 +179168,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134120,14 +179184,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -134147,8 +179210,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134166,20 +179229,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -134187,10 +179250,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134204,15 +179267,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134220,44 +179283,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134265,13 +179332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134281,7 +179348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -134307,8 +179374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134326,31 +179393,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134364,15 +179431,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134380,29 +179447,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -134413,11 +179484,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134425,13 +179496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134441,8 +179512,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -134467,8 +179538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134486,31 +179557,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134524,60 +179595,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134585,13 +179660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134601,8 +179676,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -134627,8 +179702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134646,31 +179721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134684,15 +179759,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134700,33 +179775,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -134734,10 +179813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134745,13 +179824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134761,7 +179840,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -134787,8 +179866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134806,31 +179885,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134844,60 +179923,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134905,13 +179988,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -134921,8 +180004,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -134947,8 +180030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134966,31 +180049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135004,15 +180087,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135020,33 +180103,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135054,10 +180141,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135065,13 +180152,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135081,8 +180168,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -135107,8 +180194,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135126,31 +180213,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135164,7 +180251,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135172,41 +180259,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135217,11 +180304,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135229,13 +180316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135271,8 +180358,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135290,31 +180377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135328,45 +180415,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135377,11 +180468,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135389,13 +180480,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135405,7 +180496,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -135431,8 +180522,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135450,31 +180541,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135488,15 +180579,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -135504,33 +180595,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCA: 128 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -135538,10 +180633,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135549,13 +180644,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135565,8 +180660,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -135591,8 +180686,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135610,31 +180705,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135648,7 +180743,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135673,34 +180768,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -135709,12 +180804,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135751,8 +180846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135770,20 +180865,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -135791,10 +180886,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135808,7 +180903,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -135833,34 +180928,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -135869,12 +180964,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135911,8 +181006,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135930,20 +181025,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -135951,10 +181046,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -135968,60 +181063,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136029,12 +181128,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136045,8 +181144,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -136071,8 +181170,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136090,31 +181189,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -136128,7 +181227,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -136148,40 +181247,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136189,8 +181288,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -136206,7 +181305,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -136231,8 +181330,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136250,31 +181349,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -136288,13 +181387,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136314,38 +181413,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136353,11 +181448,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -136369,7 +181464,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -136395,8 +181490,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136414,31 +181509,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -136452,13 +181547,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136472,40 +181567,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136513,13 +181612,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136529,7 +181628,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -136555,8 +181654,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136574,31 +181673,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -136612,13 +181711,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136632,33 +181731,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -136666,10 +181761,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136677,13 +181772,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136693,7 +181788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -136719,8 +181814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136738,31 +181833,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -136776,13 +181871,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136796,29 +181891,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -136826,10 +181925,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136837,13 +181936,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136853,7 +181952,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -136879,8 +181978,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136898,31 +181997,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -136936,13 +182035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136953,32 +182052,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136989,11 +182084,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137001,13 +182096,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -137017,7 +182112,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137043,8 +182138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137062,31 +182157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137134,7 +182229,7 @@ LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1056 + LdsNumElements: 4096 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -137142,18 +182237,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137161,12 +182256,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137203,8 +182298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137222,29 +182317,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -137266,7 +182361,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137294,22 +182389,26 @@ LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -137337,8 +182436,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -137363,8 +182462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137382,20 +182481,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -137403,8 +182502,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -137426,7 +182525,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137454,22 +182553,26 @@ LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -137497,7 +182600,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137523,8 +182626,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137542,20 +182645,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -137563,8 +182666,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -137580,13 +182683,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137606,34 +182709,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 544 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137646,8 +182753,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -137657,7 +182764,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137683,8 +182790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137702,31 +182809,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137746,54 +182853,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137801,8 +182912,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -137817,7 +182928,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137843,8 +182954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137862,28 +182973,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -137908,56 +183019,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137965,8 +183076,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -138007,8 +183118,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138026,29 +183137,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -138064,7 +183175,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -138072,33 +183183,33 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 @@ -138106,22 +183217,22 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138129,12 +183240,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138171,8 +183282,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138190,31 +183301,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138236,56 +183347,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138293,12 +183404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138310,7 +183421,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -138335,8 +183446,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138354,28 +183465,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138398,54 +183509,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138453,12 +183568,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138469,7 +183584,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -138495,8 +183610,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138514,29 +183629,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -138560,41 +183675,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -138606,10 +183721,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138617,7 +183732,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -138659,8 +183774,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138678,8 +183793,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -138688,19 +183803,19 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -138716,7 +183831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -138742,21 +183857,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -138769,11 +183884,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138781,11 +183896,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138798,7 +183913,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -138823,8 +183938,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138842,8 +183957,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -138852,11 +183967,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -138866,7 +183981,7 @@ WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138880,7 +183995,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -138906,17 +184021,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -138929,11 +184044,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138941,11 +184056,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138983,8 +184098,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139002,8 +184117,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139012,11 +184127,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -139024,9 +184139,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -139040,7 +184155,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -139066,21 +184181,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139093,11 +184208,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139105,11 +184220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -139147,8 +184262,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139166,8 +184281,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139176,11 +184291,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -139188,9 +184303,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -139229,22 +184344,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139258,9 +184373,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139269,11 +184384,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -139286,7 +184401,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -139311,8 +184426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139330,8 +184445,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139340,10 +184455,10 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -139374,7 +184489,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -139393,18 +184508,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139418,9 +184537,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139429,11 +184548,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -139445,7 +184564,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -139471,8 +184590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139490,8 +184609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139499,11 +184618,11 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -139512,7 +184631,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139553,22 +184672,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139582,9 +184701,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139593,11 +184712,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -139635,8 +184754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139654,8 +184773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139664,10 +184783,10 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -139676,7 +184795,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139692,7 +184811,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -139700,7 +184819,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -139708,37 +184827,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -139746,10 +184865,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139757,13 +184876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139799,8 +184918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139818,31 +184937,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -139856,7 +184975,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -139864,7 +184983,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -139872,44 +184991,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -139922,11 +185041,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139963,8 +185082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139982,31 +185101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140020,15 +185139,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -140036,33 +185155,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140070,10 +185193,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140081,8 +185204,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -140097,7 +185220,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -140123,8 +185246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140142,31 +185265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140180,7 +185303,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -140188,45 +185311,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140234,10 +185357,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140245,8 +185368,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -140287,8 +185410,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140306,31 +185429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140350,54 +185473,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140405,8 +185532,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -140421,7 +185548,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -140447,8 +185574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140466,29 +185593,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140512,7 +185639,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -140520,33 +185647,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -140558,10 +185685,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140569,13 +185696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -140611,8 +185738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140630,29 +185757,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140676,41 +185803,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -140722,9 +185849,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140733,12 +185860,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -140750,7 +185877,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -140775,8 +185902,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140794,8 +185921,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -140804,19 +185931,19 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140840,55 +185967,55 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140897,12 +186024,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -140939,8 +186066,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140958,28 +186085,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -141004,37 +186131,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -141046,10 +186173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141057,13 +186184,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141099,8 +186226,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141118,28 +186245,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -141164,41 +186291,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -141210,10 +186337,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141221,13 +186348,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141263,8 +186390,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141282,28 +186409,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 4] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -141328,56 +186455,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141385,13 +186512,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141427,8 +186554,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141446,8 +186573,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -141455,20 +186582,20 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -141492,56 +186619,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141549,11 +186676,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -141566,7 +186693,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -141591,8 +186718,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141610,29 +186737,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -141656,41 +186783,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -141702,10 +186829,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141713,7 +186840,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -141755,8 +186882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141774,8 +186901,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -141783,20 +186910,20 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -141818,7 +186945,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141837,38 +186964,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 16 - LVPA: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -141877,11 +187000,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -141893,7 +187016,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -141919,8 +187042,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141938,20 +187061,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -141959,7 +187082,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -141982,7 +187105,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141996,44 +187119,40 @@ GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142041,13 +187160,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142057,8 +187176,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -142083,8 +187202,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142102,29 +187221,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -142160,27 +187279,27 @@ GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -142194,10 +187313,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142205,13 +187324,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142247,8 +187366,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142266,29 +187385,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -142329,22 +187448,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -142358,10 +187477,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142369,13 +187488,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142411,8 +187530,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142430,20 +187549,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -142451,8 +187570,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -142468,13 +187587,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142485,47 +187604,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142533,7 +187648,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -142549,7 +187664,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -142575,8 +187690,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142594,15 +187709,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -142615,10 +187730,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142632,7 +187747,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142649,32 +187764,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142682,10 +187797,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142693,8 +187808,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -142735,8 +187850,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142754,15 +187869,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -142775,10 +187890,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142792,13 +187907,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142809,47 +187924,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142857,8 +187968,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -142873,8 +187984,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -142899,8 +188010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142918,15 +188029,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -142939,10 +188050,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142956,13 +188067,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142973,7 +188084,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -142982,38 +188093,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143021,11 +188128,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -143037,8 +188144,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -143063,8 +188170,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143082,20 +188189,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -143103,10 +188210,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143120,13 +188227,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143137,7 +188244,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -143146,27 +188253,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 528 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -143175,9 +188278,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143185,11 +188288,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -143201,7 +188304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -143227,8 +188330,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143246,20 +188349,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -143267,10 +188370,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143284,13 +188387,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143301,7 +188404,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -143310,27 +188413,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -143354,7 +188453,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -143365,8 +188464,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -143391,8 +188490,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143410,31 +188509,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143448,13 +188547,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143465,36 +188564,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -143503,9 +188598,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143513,13 +188608,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143529,8 +188624,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -143555,8 +188650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143574,15 +188669,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -143595,10 +188690,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143612,7 +188707,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -143629,36 +188724,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 3104 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -143666,10 +188761,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143677,13 +188772,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143719,8 +188814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143738,20 +188833,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -143759,10 +188854,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143801,20 +188896,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -143826,10 +188921,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143837,13 +188932,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143879,8 +188974,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143898,29 +188993,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -143936,7 +189031,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -143956,40 +189051,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143997,13 +189092,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144039,8 +189134,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144058,31 +189153,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144096,13 +189191,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144121,20 +189216,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144145,11 +189244,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144173,8 +189272,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144199,8 +189298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144218,16 +189317,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -144239,10 +189338,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144256,13 +189355,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144276,25 +189375,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144305,11 +189408,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144317,11 +189420,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -144333,7 +189436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144359,8 +189462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144378,20 +189481,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -144399,10 +189502,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144416,13 +189519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144441,20 +189544,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144465,11 +189572,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144481,8 +189588,8 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144493,8 +189600,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144519,8 +189626,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144538,16 +189645,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -144559,10 +189666,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144576,13 +189683,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144601,18 +189708,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -144625,11 +189736,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 64 - MacroTileA: 8 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144641,7 +189752,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -144653,7 +189764,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144679,8 +189790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144698,16 +189809,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -144719,10 +189830,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144736,15 +189847,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -144752,44 +189863,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144798,12 +189913,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144813,8 +189928,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144839,8 +189954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144858,31 +189973,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144896,15 +190011,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -144912,43 +190027,47 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144957,13 +190076,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144973,8 +190092,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144999,8 +190118,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145018,31 +190137,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [32, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145056,15 +190175,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145072,29 +190191,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145105,11 +190228,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145117,11 +190240,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -145133,7 +190256,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -145159,8 +190282,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145178,31 +190301,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145216,15 +190339,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145232,29 +190355,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145265,11 +190392,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 64 - MacroTileA: 8 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145277,12 +190404,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145293,8 +190420,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145319,8 +190446,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145338,31 +190465,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145376,15 +190503,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145392,44 +190519,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145437,13 +190568,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145453,8 +190584,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145479,8 +190610,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145498,37 +190629,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -145542,53 +190673,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 4 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145597,12 +190733,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145613,13 +190751,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -145639,8 +190779,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145658,37 +190798,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -145704,55 +190842,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 4 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145761,12 +190900,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145777,6 +190916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -145784,6 +190924,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -145803,8 +190944,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145822,29 +190963,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145852,7 +190993,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -145860,7 +191001,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145880,43 +191021,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145925,12 +191067,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145941,13 +191083,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -145967,8 +191111,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145986,37 +191130,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -146024,13 +191168,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -146044,29 +191188,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1344 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -146074,10 +191223,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146085,12 +191234,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146101,13 +191250,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -146127,8 +191278,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146146,37 +191297,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -146184,7 +191335,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -146208,29 +191359,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -146238,10 +191390,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146249,12 +191401,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146265,6 +191417,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -146272,6 +191425,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -146291,8 +191445,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146310,37 +191464,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -146348,7 +191502,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -146368,44 +191522,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146413,11 +191568,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -146429,13 +191584,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -146455,8 +191612,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146474,37 +191631,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -146512,14 +191669,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -146532,44 +191689,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146577,12 +191735,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146593,6 +191753,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -146600,6 +191761,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -146619,8 +191781,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146638,37 +191800,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -146676,60 +191836,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146737,12 +191902,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -146753,13 +191918,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -146779,8 +191946,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146798,37 +191965,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -146836,64 +192003,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146901,11 +192069,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -146917,6 +192087,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -146924,6 +192095,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -146943,8 +192115,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146962,37 +192134,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147000,14 +192170,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -147020,29 +192190,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 4 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 2 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2240 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -147050,9 +192225,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 4 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -147061,12 +192236,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147077,13 +192254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -147103,8 +192282,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147122,37 +192301,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147160,14 +192337,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -147184,29 +192361,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -147214,10 +192392,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147225,12 +192403,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147241,6 +192421,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -147248,6 +192429,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -147267,8 +192449,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147286,37 +192468,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147324,15 +192504,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147340,48 +192520,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147389,8 +192570,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -147405,13 +192588,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -147431,8 +192616,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147450,37 +192635,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147488,15 +192671,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147504,44 +192687,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147549,12 +192737,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147565,13 +192755,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -147591,8 +192783,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147610,37 +192802,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147648,15 +192838,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147664,48 +192854,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147713,12 +192904,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147729,6 +192922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -147736,6 +192930,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -147755,8 +192950,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147774,37 +192969,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147812,64 +193005,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147877,12 +193071,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147893,6 +193089,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -147900,6 +193097,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -147919,8 +193117,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147938,37 +193136,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 2 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 16, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -147976,7 +193172,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -147996,23 +193192,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -148022,7 +193219,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148030,10 +193227,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148041,8 +193238,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -148057,13 +193254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -148083,8 +193282,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148102,37 +193301,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -148140,7 +193339,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148160,33 +193359,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148194,10 +193394,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148205,8 +193405,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -148221,6 +193421,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -148228,6 +193429,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -148247,8 +193449,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148266,37 +193468,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -148304,7 +193506,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148312,7 +193514,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -148320,37 +193522,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -148358,10 +193561,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148369,11 +193572,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148385,13 +193588,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -148411,8 +193616,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148430,37 +193635,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -148468,7 +193673,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148476,7 +193681,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -148484,48 +193689,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148533,11 +193739,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148549,13 +193755,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -148575,8 +193783,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148594,37 +193802,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -148632,7 +193840,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148640,7 +193848,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -148648,48 +193856,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148697,12 +193906,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -148713,6 +193922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -148720,6 +193930,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -148739,8 +193950,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148758,37 +193969,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -148796,7 +194007,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148804,7 +194015,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -148812,48 +194023,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148861,11 +194073,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148877,6 +194089,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -148884,6 +194097,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -148903,8 +194117,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148922,37 +194136,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -148960,7 +194174,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148968,56 +194182,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149025,11 +194240,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -149041,6 +194256,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -149048,6 +194264,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -149067,8 +194284,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149086,37 +194303,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -149124,53 +194341,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149178,10 +194396,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149189,12 +194407,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149205,6 +194425,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -149212,6 +194433,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -149231,8 +194453,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149250,37 +194472,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -149288,14 +194508,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -149312,29 +194532,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149342,10 +194563,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149353,12 +194574,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149369,13 +194592,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -149395,8 +194620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149414,37 +194639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -149452,14 +194675,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -149476,40 +194699,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149517,12 +194741,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149533,6 +194759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -149540,6 +194767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -149559,8 +194787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149578,37 +194806,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -149616,64 +194842,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149681,8 +194908,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -149697,13 +194926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -149723,8 +194954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149742,37 +194973,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -149780,64 +195009,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149845,13 +195075,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -149861,13 +195093,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -149887,8 +195121,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149906,37 +195140,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -149944,53 +195176,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149998,10 +195231,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150009,11 +195242,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -150025,13 +195260,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -150051,8 +195288,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150070,37 +195307,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -150108,64 +195343,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150173,8 +195409,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150189,13 +195427,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -150215,8 +195455,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150234,37 +195474,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -150272,60 +195510,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150333,12 +195576,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150349,13 +195594,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -150375,8 +195622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150394,37 +195641,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -150432,7 +195677,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150440,56 +195685,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150497,8 +195743,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150513,13 +195759,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -150539,8 +195787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150558,37 +195806,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -150596,7 +195844,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150604,56 +195852,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150661,8 +195910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150677,6 +195926,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -150684,6 +195934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -150703,8 +195954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150722,37 +195973,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -150766,41 +196017,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -150814,10 +196062,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150825,8 +196073,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150841,13 +196091,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -150867,8 +196119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150886,23 +196138,23 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -150911,12 +196163,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -150924,14 +196174,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -150948,40 +196198,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150989,12 +196240,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151005,6 +196258,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -151012,6 +196266,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -151031,8 +196286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151050,37 +196305,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -151088,13 +196341,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151112,36 +196365,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151149,12 +196407,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151165,13 +196423,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -151191,8 +196451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151210,37 +196470,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -151248,13 +196508,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151272,25 +196532,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -151298,10 +196563,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151309,12 +196574,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151325,13 +196590,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -151351,8 +196618,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151370,37 +196637,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -151408,53 +196675,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -151462,10 +196730,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151473,12 +196741,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151489,6 +196759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -151496,6 +196767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -151515,8 +196787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151534,37 +196806,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -151579,9 +196849,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -151592,44 +196862,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151637,8 +196908,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -151653,6 +196926,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -151660,6 +196934,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -151679,8 +196954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151698,37 +196973,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -151736,16 +197009,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -151760,40 +197033,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151801,12 +197075,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151817,6 +197093,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -151824,6 +197101,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -151843,8 +197121,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151862,16 +197140,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151883,16 +197161,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -151900,64 +197176,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151965,11 +197242,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -151981,6 +197260,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -151988,6 +197268,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152007,8 +197288,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152026,16 +197307,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -152046,17 +197327,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -152064,13 +197343,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -152088,36 +197367,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152125,12 +197409,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -152141,13 +197425,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152167,8 +197453,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152186,15 +197472,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -152207,16 +197493,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -152224,7 +197510,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152232,7 +197518,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -152240,37 +197526,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -152279,9 +197566,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152289,8 +197576,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152305,6 +197592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -152312,6 +197600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152331,8 +197620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152350,16 +197639,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -152370,17 +197659,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -152388,7 +197677,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152396,52 +197685,53 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -152453,13 +197743,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152469,6 +197759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -152476,6 +197767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152495,8 +197787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152514,8 +197806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -152523,7 +197815,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -152534,17 +197826,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -152552,60 +197844,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152613,12 +197910,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -152629,13 +197928,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152655,8 +197956,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152674,37 +197975,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -152712,47 +198011,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152765,10 +198065,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -152777,13 +198077,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152793,6 +198095,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -152800,6 +198103,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152819,8 +198123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152838,37 +198142,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -152876,43 +198178,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -152925,11 +198232,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152937,13 +198244,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152953,13 +198262,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -152979,8 +198290,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152998,37 +198309,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -153036,7 +198345,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -153060,17 +198369,18 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3648 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 @@ -153089,11 +198399,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153101,13 +198411,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153117,6 +198427,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -153124,6 +198435,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -153143,8 +198455,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153162,37 +198474,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -153200,23 +198512,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -153224,36 +198536,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153261,12 +198578,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153277,13 +198594,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -153303,8 +198622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153322,37 +198641,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -153360,23 +198679,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -153384,35 +198703,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -153421,11 +198745,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -153437,13 +198763,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -153463,8 +198791,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153482,14 +198810,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -153503,16 +198831,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -153527,9 +198853,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -153540,27 +198866,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -153574,10 +198901,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153585,13 +198912,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153601,6 +198930,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -153608,6 +198938,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -153627,8 +198958,175 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153646,37 +199144,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -153691,40 +199187,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -153738,10 +199235,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153749,13 +199246,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153765,6 +199264,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -153772,6 +199272,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -153791,8 +199292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153810,37 +199311,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -153855,7 +199354,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -153872,40 +199371,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 64 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153913,13 +199413,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153929,13 +199431,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -153955,8 +199459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153974,20 +199478,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -153995,16 +199499,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -154012,49 +199514,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154065,11 +199568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154078,12 +199581,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154093,6 +199598,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -154100,6 +199606,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -154119,8 +199626,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154138,16 +199645,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154158,17 +199665,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -154176,53 +199681,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -154231,9 +199733,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154241,13 +199743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154257,13 +199762,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -154283,8 +199790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154302,37 +199809,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -154340,49 +199845,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154393,7 +199895,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -154407,11 +199909,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154421,13 +199926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -154447,8 +199954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154466,8 +199973,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154475,14 +199982,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -154490,13 +199997,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -154511,7 +200016,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -154528,25 +200033,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154558,9 +200064,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -154569,13 +200075,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154585,6 +200094,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -154592,6 +200102,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -154611,8 +200122,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154630,8 +200141,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154639,11 +200150,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -154655,12 +200166,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -154674,8 +200183,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -154692,6 +200201,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 @@ -154702,11 +200212,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154731,11 +200245,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154745,13 +200262,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -154771,8 +200290,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154790,8 +200309,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154815,12 +200334,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -154828,7 +200345,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -154836,41 +200353,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154881,7 +200399,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -154894,12 +200412,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154909,6 +200428,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -154916,6 +200436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -154935,8 +200456,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154954,8 +200475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154963,7 +200484,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154974,17 +200495,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155016,6 +200537,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 @@ -155026,15 +200548,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155047,9 +200569,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155057,13 +200579,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155073,6 +200596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -155080,6 +200604,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -155099,8 +200624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155118,8 +200643,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155127,12 +200652,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -155140,7 +200665,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -155148,7 +200673,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155163,42 +200688,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155210,9 +200736,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155221,13 +200747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155237,6 +200766,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -155244,6 +200774,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -155263,8 +200794,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155282,8 +200813,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155291,28 +200822,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155320,7 +200849,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -155328,41 +200857,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155373,10 +200903,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155385,13 +200915,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155401,6 +200932,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -155408,6 +200940,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -155427,8 +200960,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155446,8 +200979,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155455,28 +200988,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155484,15 +201017,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155500,37 +201033,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155538,10 +201072,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155549,13 +201083,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155565,6 +201102,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -155572,6 +201110,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -155591,8 +201130,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155610,37 +201149,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155648,16 +201185,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -155672,29 +201209,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155702,9 +201240,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155713,13 +201251,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155729,6 +201270,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -155736,6 +201278,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -155755,8 +201298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155774,37 +201317,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155812,14 +201353,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -155836,29 +201377,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155867,9 +201409,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155877,13 +201419,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155893,6 +201438,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -155900,6 +201446,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -155919,8 +201466,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155938,16 +201485,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -155959,16 +201506,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -155976,14 +201521,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -156000,39 +201545,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156041,13 +201587,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156057,6 +201606,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -156064,6 +201614,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -156083,8 +201634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156102,16 +201653,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -156123,16 +201674,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -156140,7 +201689,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -156164,36 +201713,37 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -156205,13 +201755,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156221,6 +201772,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -156228,6 +201780,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -156247,8 +201800,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156266,20 +201819,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -156287,16 +201840,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -156304,7 +201857,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -156328,29 +201881,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -156358,10 +201912,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156369,13 +201923,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156385,6 +201940,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -156392,6 +201948,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -156411,8 +201968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156430,20 +201987,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -156451,16 +202008,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -156468,53 +202025,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -156522,10 +202076,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156533,13 +202087,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156549,13 +202106,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -156575,8 +202134,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156594,37 +202153,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -156632,64 +202189,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156698,12 +202252,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156713,13 +202270,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -156739,8 +202298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156758,37 +202317,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -156796,7 +202353,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -156804,7 +202361,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -156816,29 +202373,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156849,10 +202407,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -156861,13 +202419,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156877,6 +202436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -156884,6 +202444,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -156903,8 +202464,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156922,8 +202483,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -156931,28 +202492,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -156960,64 +202521,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157025,13 +202583,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157041,13 +202602,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157067,8 +202630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157086,37 +202649,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157124,60 +202685,57 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -157189,13 +202747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157205,13 +202766,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157231,8 +202794,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157250,37 +202813,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157294,8 +202855,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -157312,36 +202873,33 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -157353,13 +202911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157369,13 +202930,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157395,8 +202958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157414,20 +202977,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157435,16 +202998,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157452,16 +203013,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -157476,29 +203037,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -157506,9 +203064,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157517,13 +203075,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157533,13 +203092,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157559,8 +203120,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157578,16 +203139,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -157599,16 +203160,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157616,60 +203177,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 + LSCA: 32 + LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157677,13 +203239,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157693,13 +203258,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157719,8 +203286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157738,37 +203305,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157776,15 +203341,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -157792,44 +203357,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157837,13 +203407,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157853,13 +203426,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157879,8 +203454,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157898,15 +203473,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -157918,17 +203493,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157936,7 +203509,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157945,7 +203518,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -157960,25 +203533,26 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -157989,11 +203563,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158001,13 +203575,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158017,6 +203592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -158024,6 +203600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158043,8 +203620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158062,20 +203639,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158083,16 +203660,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158100,60 +203677,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158161,13 +203743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158177,13 +203762,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158203,8 +203790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158222,37 +203809,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158260,60 +203845,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 + LSCB: 32 LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158321,13 +203911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158337,13 +203930,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158363,8 +203958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158382,37 +203977,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158420,7 +204013,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158444,40 +204037,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158485,13 +204079,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158501,6 +204096,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -158508,6 +204104,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158527,8 +204124,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158546,20 +204143,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158567,16 +204164,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158584,15 +204181,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -158600,33 +204197,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158634,10 +204236,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158645,13 +204247,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158661,13 +204266,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158687,8 +204294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158706,15 +204313,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -158726,13 +204333,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -158744,53 +204349,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158798,9 +204400,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158809,13 +204411,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158825,13 +204430,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158851,8 +204458,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158870,37 +204477,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158908,13 +204513,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -158924,7 +204529,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -158932,35 +204537,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158969,13 +204579,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -158985,13 +204596,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159011,8 +204624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159030,14 +204643,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -159051,10 +204664,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -159068,49 +204681,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 16 + LVCA: 128 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159118,9 +204736,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159129,13 +204747,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -159145,13 +204766,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159171,8 +204794,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159190,37 +204813,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159228,14 +204849,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -159252,29 +204873,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159282,9 +204904,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159293,13 +204915,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -159309,6 +204934,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159316,6 +204942,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159335,8 +204962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159354,20 +204981,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159375,12 +205002,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -159392,53 +205017,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159447,9 +205069,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159457,13 +205079,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -159473,13 +205098,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159499,8 +205126,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159518,33 +205145,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -159556,63 +205181,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159621,13 +205247,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -159637,13 +205266,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159663,8 +205294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159682,33 +205313,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1287 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -159720,53 +205349,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159774,10 +205400,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159785,13 +205411,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -159801,13 +205430,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159827,8 +205458,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159846,33 +205477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1288 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -159890,58 +205519,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159949,13 +205575,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -159965,13 +205594,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159991,8 +205622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160010,33 +205641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -160048,53 +205677,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -160102,9 +205732,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160113,13 +205743,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160129,13 +205762,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160155,8 +205790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160174,37 +205809,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1290 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160212,49 +205845,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -160265,11 +205899,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160277,13 +205911,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160293,6 +205930,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -160300,6 +205938,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160319,8 +205958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160338,33 +205977,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1291 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -160382,43 +206019,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -160430,10 +206064,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160442,12 +206076,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160457,13 +206094,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160483,8 +206122,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160502,33 +206141,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1292 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -160540,47 +206177,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -160593,10 +206227,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160605,13 +206239,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160621,13 +206258,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160647,8 +206286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160666,37 +206305,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1293 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160704,7 +206341,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160712,7 +206349,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160720,33 +206357,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -160757,11 +206395,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160770,12 +206408,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160785,6 +206424,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -160792,6 +206432,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160811,8 +206452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160830,16 +206471,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1294 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -160850,17 +206491,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160874,68 +206515,78 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -160945,13 +206596,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160971,9 +206624,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -160990,15 +206644,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1295 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -161010,17 +206664,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161035,71 +206687,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161109,6 +206767,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161116,6 +206775,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161135,9 +206795,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -161154,16 +206815,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1296 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -161174,17 +206835,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161199,71 +206858,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161273,6 +206938,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161280,6 +206946,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161299,9 +206966,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -161318,16 +206986,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1297 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -161338,17 +207006,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161356,7 +207022,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161364,70 +207030,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161437,6 +207107,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161444,6 +207115,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161463,9 +207135,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -161482,16 +207155,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1298 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -161502,17 +207175,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161520,7 +207193,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161528,70 +207201,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161601,6 +207278,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161608,6 +207286,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161627,9 +207306,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -161646,16 +207326,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1299 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -161666,17 +207346,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161684,7 +207364,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161692,8 +207372,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -161704,33 +207384,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161739,23 +207420,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161765,13 +207449,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161791,9 +207477,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -161810,37 +207497,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1300 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161848,7 +207535,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -161856,8 +207543,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -161868,33 +207555,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161903,23 +207591,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -161929,6 +207620,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161936,6 +207628,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161955,9 +207648,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -161974,37 +207668,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1301 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162012,7 +207706,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162020,8 +207714,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162032,33 +207726,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162067,23 +207762,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162093,6 +207791,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162100,6 +207799,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162119,9 +207819,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -162138,37 +207839,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1302 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162176,16 +207877,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162200,29 +207901,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162230,24 +207932,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162257,13 +207964,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162283,9 +207992,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -162302,37 +208012,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1303 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162340,16 +208048,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162364,54 +208072,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162421,13 +208135,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162447,9 +208163,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -162466,20 +208183,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1304 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -162487,16 +208204,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162504,14 +208219,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -162528,29 +208243,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162559,23 +208275,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162585,6 +208306,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162592,6 +208314,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162611,9 +208334,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -162630,37 +208354,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1305 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162668,16 +208390,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162692,29 +208414,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162722,24 +208445,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162749,6 +208477,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162756,6 +208485,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162775,9 +208505,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -162794,20 +208525,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1306 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -162815,16 +208546,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162832,14 +208561,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -162856,29 +208585,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162886,24 +208616,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162913,6 +208648,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162920,6 +208656,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162939,9 +208676,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -162958,20 +208696,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1307 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -162979,16 +208717,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162996,49 +208732,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163046,24 +208787,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163073,13 +208817,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163099,9 +208845,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -163118,37 +208865,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163156,7 +208903,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -163165,7 +208912,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -163180,29 +208927,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163210,24 +208958,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163237,13 +208988,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163263,9 +209016,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -163282,20 +209036,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1309 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -163303,16 +209057,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163320,7 +209074,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -163329,7 +209083,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -163344,29 +209098,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163374,24 +209129,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163401,13 +209159,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163427,9 +209187,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -163446,16 +209207,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -163467,16 +209228,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163484,7 +209245,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -163493,7 +209254,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -163508,54 +209269,58 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163565,6 +209330,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163572,6 +209338,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163591,9 +209358,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -163610,20 +209378,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1311 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -163631,16 +209399,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163648,7 +209416,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -163657,7 +209425,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -163672,29 +209440,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163702,24 +209471,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163729,6 +209501,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163736,6 +209509,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163755,9 +209529,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -163773,21 +209548,21 @@ ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1040 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + ScheduleLocalWrite: 1 + SolutionIndex: 1312 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -163795,16 +209570,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163812,74 +209587,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -163889,13 +209674,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163915,9 +209702,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -163934,15 +209722,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1313 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -163954,17 +209742,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163978,9 +209764,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -163988,58 +209774,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164049,13 +209845,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164075,9 +209873,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -164094,15 +209893,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1314 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -164114,17 +209913,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164140,7 +209937,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -164148,33 +209945,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -164186,24 +209984,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164213,6 +210014,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -164220,6 +210022,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164239,9 +210042,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -164258,15 +210062,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1315 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -164278,8 +210082,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -164288,7 +210092,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164296,53 +210100,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164350,24 +210155,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164377,6 +210187,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -164384,6 +210195,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164403,15 +210215,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -164422,15 +210236,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1316 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -164438,21 +210252,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164466,68 +210278,78 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164537,13 +210359,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164563,15 +210387,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -164582,37 +210408,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1317 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164620,45 +210444,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -164669,25 +210494,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164697,6 +210527,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -164704,6 +210535,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164723,15 +210555,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -164742,8 +210576,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1318 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -164752,27 +210586,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164786,37 +210618,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -164830,24 +210667,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -164857,13 +210699,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164883,15 +210727,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -164902,8 +210748,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1319 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -164912,27 +210758,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164946,37 +210790,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -164990,24 +210839,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165017,13 +210871,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165043,15 +210899,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -165062,8 +210920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1320 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -165072,27 +210930,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165100,45 +210956,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -165149,25 +211006,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165177,6 +211039,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: @@ -165184,6 +211047,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165203,15 +211067,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -165222,8 +211088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1321 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -165232,27 +211098,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165266,68 +211130,78 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165337,13 +211211,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165363,15 +211239,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -165382,37 +211260,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1322 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165420,13 +211296,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -165444,25 +211320,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165470,24 +211351,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165497,13 +211381,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165523,15 +211409,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -165542,37 +211430,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1323 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165588,70 +211476,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165661,6 +211553,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -165668,6 +211561,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165687,15 +211581,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -165706,28 +211602,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1324 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -165736,7 +211632,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165750,39 +211646,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -165794,24 +211695,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165821,13 +211725,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165847,15 +211753,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -165866,8 +211774,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1325 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -165876,19 +211784,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -165896,7 +211804,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165904,49 +211812,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165954,24 +211867,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -165981,13 +211899,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166007,15 +211927,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -166026,37 +211948,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1326 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166064,53 +211984,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3616 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166118,24 +212039,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166145,13 +212071,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166171,15 +212099,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -166190,37 +212120,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1327 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166228,78 +212156,80 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166309,13 +212239,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166335,15 +212267,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -166354,37 +212288,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1328 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166392,53 +212324,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166446,24 +212379,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166473,6 +212411,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166480,6 +212419,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166499,15 +212439,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -166518,37 +212460,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1329 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166556,14 +212496,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -166580,29 +212520,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166610,24 +212551,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166637,6 +212583,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166644,6 +212591,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166663,15 +212611,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -166682,20 +212632,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1330 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -166703,16 +212653,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166720,15 +212668,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -166736,62 +212684,64 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166801,13 +212751,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166827,15 +212779,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -166846,37 +212800,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1331 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166884,14 +212836,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -166908,54 +212860,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166965,13 +212923,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166991,15 +212951,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -167010,20 +212972,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1332 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -167031,16 +212993,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167048,14 +213008,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -167072,29 +213032,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167102,24 +213063,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167129,6 +213095,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167136,6 +213103,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167155,15 +213123,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -167174,20 +213144,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 + SolutionIndex: 1333 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -167195,16 +213165,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167212,14 +213180,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -167236,29 +213204,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167266,24 +213235,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167293,6 +213267,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167300,6 +213275,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167319,15 +213295,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -167338,20 +213316,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1334 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -167359,16 +213337,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167376,14 +213352,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -167400,54 +213376,60 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167457,13 +213439,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167483,15 +213467,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -167502,20 +213488,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1335 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -167523,12 +213509,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167540,7 +213524,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167549,7 +213533,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -167566,24 +213550,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -167594,20 +213578,22 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -167615,6 +213601,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167652,15 +213639,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -167671,8 +213660,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1336 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167681,10 +213670,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -167693,9 +213682,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167714,7 +213703,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -167733,24 +213722,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -167762,24 +213751,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167817,15 +213811,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -167836,8 +213832,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1337 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167846,10 +213842,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -167861,8 +213857,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167900,22 +213894,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 4 LSPB: 64 - LVCA: 48 + LVCA: 64 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -167929,24 +213923,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167958,7 +213955,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -167984,15 +213981,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -168003,8 +214002,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1338 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168013,10 +214012,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168028,7 +214027,7 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -168067,22 +214066,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 4 LSPB: 64 - LVCA: 48 + LVCA: 64 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168096,24 +214095,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168125,7 +214127,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -168151,15 +214153,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -168170,8 +214174,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1339 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168180,10 +214184,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -168195,7 +214199,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -168216,7 +214220,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168224,32 +214228,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168263,24 +214267,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168318,15 +214325,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -168337,8 +214346,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1340 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168347,22 +214356,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -168383,7 +214392,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168391,32 +214400,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168430,24 +214439,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168485,15 +214497,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -168504,8 +214518,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1341 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168514,22 +214528,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -168542,15 +214556,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168558,34 +214572,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -168596,27 +214610,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168654,15 +214669,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -168673,8 +214690,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1342 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168683,21 +214700,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168716,41 +214735,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168764,24 +214783,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168819,15 +214843,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -168838,8 +214864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1343 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168848,23 +214874,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168884,36 +214908,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -168932,25 +214956,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168988,15 +215015,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -169007,8 +215036,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1344 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169017,13 +215046,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -169050,7 +215079,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169078,9 +215107,9 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -169099,25 +215128,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169155,15 +215185,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -169174,8 +215206,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1345 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169184,11 +215216,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -169196,9 +215228,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169217,20 +215251,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -169238,16 +215272,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -169266,25 +215300,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169322,15 +215357,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -169341,8 +215378,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1346 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169351,21 +215388,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169403,24 +215442,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169432,26 +215471,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169489,15 +215531,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -169508,20 +215552,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1347 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169529,7 +215573,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -169551,7 +215595,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169570,24 +215614,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169599,26 +215643,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169630,7 +215675,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -169656,15 +215701,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -169675,20 +215722,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1348 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169696,10 +215743,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169717,8 +215766,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -169737,24 +215786,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169766,26 +215811,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169796,7 +215842,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169823,15 +215869,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -169842,20 +215890,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1349 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169863,10 +215911,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169878,7 +215928,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -169886,42 +215936,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1680 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 192 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169932,27 +215982,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169990,15 +216043,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -170009,31 +216064,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1350 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170052,41 +216107,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 - LVCB: 4 + LVCA: 16 + LVCB: 2 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -170100,24 +216155,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170155,15 +216215,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -170174,33 +216236,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1351 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170238,22 +216298,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 4 - LSPB: 64 - LVCA: 64 + LSPB: 32 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -170267,14 +216327,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -170283,8 +216345,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170296,7 +216359,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -170322,15 +216385,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -170341,28 +216406,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1352 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -170380,78 +216445,79 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170462,8 +216528,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170489,15 +216555,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -170508,15 +216576,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1353 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -170524,17 +216592,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170547,78 +216613,79 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170629,8 +216696,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170656,15 +216723,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -170675,15 +216744,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1354 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -170691,17 +216760,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170714,78 +216781,79 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170796,7 +216864,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -170823,15 +216891,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -170842,15 +216912,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1355 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -170858,17 +216928,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170880,15 +216948,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170896,32 +216964,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -170934,25 +216998,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170963,8 +217032,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -170990,15 +217059,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -171009,8 +217080,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1356 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171019,23 +217090,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171054,41 +217123,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171102,24 +217171,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171157,15 +217231,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -171176,8 +217252,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1357 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171186,23 +217262,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171221,41 +217295,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171269,26 +217343,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171326,15 +217401,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -171345,8 +217422,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1358 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171355,21 +217432,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171388,7 +217467,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -171407,22 +217486,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 4 + LSPA: 5 LSPB: 64 - LVCA: 64 + LVCA: 48 LVCB: 4 - LVPA: 2 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171436,26 +217515,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171467,7 +217547,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -171493,15 +217573,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -171512,8 +217594,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1359 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171522,11 +217604,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -171537,6 +217619,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171555,7 +217639,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -171574,22 +217658,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171603,26 +217687,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171660,15 +217745,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -171679,8 +217766,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1360 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171689,11 +217776,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -171704,6 +217791,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171722,8 +217811,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -171735,28 +217824,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 96 LVCA: 32 LVCB: 2 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171770,26 +217859,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171801,7 +217891,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -171827,15 +217917,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -171846,8 +217938,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1361 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171856,21 +217948,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171882,7 +217976,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -171891,7 +217985,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -171908,28 +218002,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -171937,19 +218031,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -171957,6 +218053,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171968,7 +218065,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -171994,15 +218091,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -172013,31 +218112,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1362 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172049,16 +218148,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -172075,28 +218174,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -172104,26 +218203,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172161,15 +218261,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -172180,31 +218282,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1363 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172216,7 +218320,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172224,46 +218328,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -172271,26 +218375,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172328,15 +218435,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -172347,31 +218456,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1364 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172383,7 +218492,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172391,73 +218500,76 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172495,15 +218607,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -172514,31 +218628,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1365 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172550,15 +218664,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -172570,34 +218684,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -172605,24 +218719,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172634,7 +218753,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -172660,15 +218779,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -172679,33 +218800,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1366 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172717,15 +218836,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -172737,34 +218856,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -172772,24 +218891,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172827,15 +218951,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -172846,33 +218972,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1367 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172884,13 +219008,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -172910,24 +219034,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -172935,26 +219063,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172965,8 +219096,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -172992,15 +219123,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -173011,20 +219144,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1368 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -173032,10 +219165,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173054,7 +219187,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -173073,55 +219206,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173159,15 +219293,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -173178,14 +219314,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1369 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -173199,10 +219335,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173223,14 +219361,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -173240,53 +219378,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173324,15 +219465,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -173343,14 +219486,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1370 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -173364,7 +219507,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -173381,15 +219524,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -173398,62 +219541,67 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173491,15 +219639,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -173510,33 +219660,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1371 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173557,7 +219705,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -173568,7 +219716,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -173577,52 +219725,55 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173660,15 +219811,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -173679,28 +219832,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1372 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -173715,7 +219868,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -173724,45 +219877,45 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -173771,25 +219924,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173827,15 +219983,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -173846,31 +220004,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1373 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173882,15 +220040,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -173898,38 +220056,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -173937,26 +220095,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173994,15 +220153,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -174013,31 +220174,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1374 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174057,66 +220220,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -174124,6 +220289,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174161,15 +220327,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -174180,29 +220348,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1375 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -174242,53 +220410,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174326,15 +220497,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -174345,20 +220518,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1376 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -174366,8 +220539,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -174390,72 +220563,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174493,15 +220671,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -174512,15 +220692,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1377 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -174528,17 +220708,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174550,16 +220728,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174577,52 +220755,57 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174660,15 +220843,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -174679,15 +220864,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1378 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -174700,12 +220885,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174725,8 +220908,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174737,30 +220920,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -174772,26 +220955,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174829,15 +221015,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -174848,28 +221036,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1379 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -174891,43 +221079,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -174939,26 +221127,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174996,15 +221185,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -175015,31 +221206,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1380 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175058,43 +221251,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -175106,26 +221299,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175163,15 +221357,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -175182,31 +221378,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1381 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175244,22 +221442,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -175273,14 +221471,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -175288,9 +221488,10 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175328,48 +221529,52 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1382 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -175394,14 +221599,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -175411,22 +221616,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -175440,14 +221645,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -175455,9 +221662,10 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175495,47 +221703,51 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1383 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -175559,9 +221771,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -175572,30 +221784,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -175607,26 +221819,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175638,7 +221851,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -175664,34 +221877,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1384 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -175703,11 +221920,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175719,7 +221938,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -175727,46 +221946,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175774,26 +221993,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175805,7 +222027,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -175831,50 +222053,54 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1385 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175886,7 +222112,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -175894,46 +222120,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175941,26 +222167,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175972,7 +222201,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -175998,50 +222227,54 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1386 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176053,7 +222286,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -176061,73 +222294,76 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176165,50 +222401,54 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1387 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176220,15 +222460,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176236,38 +222476,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176275,26 +222515,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176332,50 +222573,56 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1388 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176387,16 +222634,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -176413,55 +222660,56 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176499,39 +222747,43 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1389 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -176539,10 +222791,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -178944,36 +225198,18 @@ - [101, 4291.65] - - [3136, 64, 128, 64] - [183, 8175.06] - - - [784, 512, 64, 128] - - [181, 8378.34] - - - [3136, 256, 64, 64] - - [184, 8506.65] - - - [12544, 1024, 1, 256] - - [177, 8927.93] - - [784, 128, 128, 512] - [182, 8190.53] - - [784, 512, 256, 128] - [180, 8637.14] - - - [3136, 64, 64, 256] - - [179, 8782.93] - - - [3136, 512, 1, 2048] - - [176, 7298.32] - - - [12544, 256, 1, 1024] - - [188, 7667.25] - - - [3136, 2048, 1, 512] - - [187, 8447.22] - - [3136, 256, 256, 64] - [180, 8663.08] - - [3136, 64, 128, 256] - [178, 8943.46] - - - [784, 128, 64, 512] - - [186, 8006.27] - - [3136, 64, 256, 64] - [183, 8267.12] - - [784, 512, 128, 128] - [180, 8564.25] - - - [3136, 64, 64, 64] - - [183, 8009.35] - - [784, 128, 256, 512] - [184, 8377.06] - - [3136, 64, 256, 256] @@ -179022,8 +225258,6 @@ - [212, 8995.84] - - [4096, 512, 1, 2048] - [207, 9298.08] - - - [512, 256, 1, 2048] - - [200, 5186.16] - - [4096, 1024, 1, 2048] - [189, 9790.67] - - [2048, 1024, 1, 2048] @@ -179060,8 +225294,6 @@ - [213, 4032.98] - - [1024, 256, 1, 4096] - [201, 7326.3] - - - [4096, 512, 1, 4096] - - [193, 9471.97] - - [1024, 200, 1, 2048] - [194, 5530.46] - - [2048, 1024, 1, 512] @@ -182512,5972 +228744,6954 @@ - [524, 10427.3] - - [1024, 1, 1, 13] - [537, 0.0] + - - [768, 512, 1, 768] + - [561, 5889.04] + - - [768, 2048, 1, 3072] + - [571, 9394.62] + - - [768, 32, 1, 768] + - [583, 1502.74] + - - [64, 128, 96, 128] + - [578, 4973.48] + - - [3072, 1024, 1, 768] + - [572, 9856.07] + - - [768, 1024, 1, 3072] + - [565, 8611.06] + - - [768, 512, 1, 3072] + - [564, 6430.79] + - - [768, 64, 1, 768] + - [585, 2621.44] + - - [768, 4096, 1, 3072] + - [570, 10030.4] + - - [768, 2048, 1, 2] + - [563, 381.763] + - - [768, 2048, 1, 768] + - [568, 9754.2] + - - [768, 320, 1, 30522] + - [581, 8529.4] + - - [64, 64, 96, 64] + - [575, 2496.61] + - - [768, 640, 1, 30522] + - [562, 8253.84] + - - [768, 1280, 1, 30522] + - [567, 9572.85] + - - [768, 1280, 1, 768] + - [571, 8713.93] + - - [768, 640, 1, 768] + - [561, 7293.03] + - - [768, 32, 1, 2] + - [573, 11.8154] + - - [3072, 2048, 1, 768] + - [568, 10019.6] + - - [768, 4096, 1, 768] + - [568, 9927.35] + - - [3072, 4096, 1, 768] + - [571, 10150.1] + - - [64, 256, 192, 256] + - [577, 7054.19] + - - [768, 8, 1, 768] + - [584, 340.939] + - - [64, 128, 384, 128] + - [576, 6765.01] + - - [768, 1024, 1, 768] + - [566, 8768.58] + - - [768, 320, 1, 768] + - [582, 6838.54] + - - [64, 64, 768, 64] + - [579, 5388.83] + - - [768, 1024, 1, 2] + - [559, 258.695] + - - [768, 16, 1, 768] + - [584, 819.2] + - - [64, 256, 96, 256] + - [577, 5893.64] + - - [3072, 512, 1, 768] + - [569, 9722.79] + - - [768, 160, 1, 768] + - [586, 5019.78] + - - [768, 4096, 1, 2] + - [560, 507.375] + - - [1600, 512, 1, 1024] + - [590, 7186.95] + - - [1024, 512, 1, 64] + - [588, 2557.5] + - - [1024, 512, 1, 1] + - [587, 71.2348] + - - [2048, 512, 1, 1] + - [589, 90.3945] + - - [1024, 200, 1, 1] + - [595, 40.0] + - - [32, 200, 1, 1] + - [591, 1.56863] + - - [560, 200, 1, 1024] + - [599, 4731.35] + - - [1, 512, 1, 1] + - [598, 0.130612] + - - [64, 512, 1, 1] + - [593, 7.58519] + - - [1024, 8192, 1, 256] + - [608, 9518.99] + - - [1024, 22016, 1, 256] + - [614, 9881.12] + - - [256, 8976, 1, 4352] + - [606, 9567.08] + - - [512, 256, 1, 2048] + - [619, 5917.89] + - - [1024, 19968, 1, 256] + - [614, 9882.37] + - - [256, 8976, 1, 1536] + - [604, 8437.35] + - - [256, 8976, 1, 33536] + - [604, 8441.89] + - - [1024, 1792, 1, 256] + - [604, 7756.97] + - - [1024, 21504, 1, 256] + - [614, 9893.9] + - - [512, 215, 1, 2048] + - [620, 4665.64] + - - [1024, 7168, 1, 256] + - [608, 9509.35] + - - [256, 8976, 1, 15872] + - [610, 8914.65] + - - [1024, 19712, 1, 256] + - [614, 9771.9] + - - [256, 8976, 1, 5632] + - [610, 8740.03] + - - [1024, 14848, 1, 256] + - [614, 9756.15] + - - [1024, 28672, 1, 256] + - [614, 9958.92] + - - [256, 8976, 1, 9728] + - [617, 8853.04] + - - [1024, 17152, 1, 256] + - [608, 9737.3] + - - [256, 8976, 1, 11520] + - [610, 8999.2] + - - [256, 8976, 1, 8192] + - [600, 7897.32] + - - [1024, 3328, 1, 256] + - [615, 8593.53] + - - [256, 8976, 1, 7424] + - [610, 8980.47] + - - [1024, 18944, 1, 256] + - [614, 9854.85] + - - [1024, 10496, 1, 256] + - [609, 9453.9] + - - [256, 8976, 1, 5376] + - [607, 9608.37] + - - [256, 8976, 1, 6144] + - [604, 7880.13] + - - [1024, 40448, 1, 256] + - [614, 10016.6] + - - [256, 8976, 1, 22016] + - [617, 8939.87] + - - [256, 8976, 1, 4864] + - [605, 9211.43] + - - [256, 8976, 1, 12288] + - [601, 8065.05] + - - [1024, 9728, 1, 256] + - [614, 9636.25] + - - [256, 8976, 1, 2048] + - [602, 7001.33] + - - [1024, 10240, 1, 256] + - [608, 9619.96] + - - [256, 8976, 1, 2304] + - [606, 9509.74] + - - [1024, 7936, 1, 256] + - [614, 9300.67] + - - [768, 256, 1, 2048] + - [618, 6267.95] + - - [1024, 9984, 1, 256] + - [614, 9477.28] + - - [1024, 13312, 1, 256] + - [614, 9758.56] + - - [1024, 16128, 1, 256] + - [608, 9721.9] + - - [1024, 8960, 1, 256] + - [609, 9398.25] + - - [1024, 5120, 1, 256] + - [615, 9315.5] + - - [1024, 11264, 1, 256] + - [608, 9664.8] + - - [256, 8976, 1, 20480] + - [616, 8279.87] + - - [1024, 20992, 1, 256] + - [608, 9878.87] + - - [256, 8976, 1, 9472] + - [610, 8990.96] + - - [256, 8976, 1, 8448] + - [610, 8983.52] + - - [256, 8976, 1, 20992] + - [611, 8942.11] + - - [256, 8976, 1, 10496] + - [611, 8989.71] + - - [1024, 15104, 1, 256] + - [609, 9676.01] + - - [1024, 6400, 1, 256] + - [617, 9145.89] + - - [1024, 4096, 1, 256] + - [610, 9124.25] + - - [256, 8976, 1, 2560] + - [604, 8566.11] + - - [256, 8976, 1, 2816] + - [606, 9496.84] + - - [1024, 7680, 1, 256] + - [614, 9460.84] + - - [256, 8976, 1, 14336] + - [611, 8226.8] + - - [256, 8976, 1, 6656] + - [611, 8771.42] + - - [1024, 3072, 1, 256] + - [611, 9076.94] + - - [256, 8976, 1, 5888] + - [607, 9546.3] + - - [1024, 12288, 1, 256] + - [608, 9690.81] + - - [256, 8976, 1, 26112] + - [613, 8699.83] + - - [1024, 7424, 1, 256] + - [615, 9256.84] + - - [256, 8976, 1, 14848] + - [616, 8885.79] + - - [768, 215, 1, 2048] + - [618, 5628.59] + - - [1024, 2560, 1, 256] + - [611, 8820.83] + - - [256, 8976, 1, 19968] + - [610, 8928.86] + - - [256, 8976, 1, 9984] + - [610, 8993.12] + - - [1024, 4864, 1, 256] + - [611, 8974.3] + - - [1024, 33536, 1, 256] + - [614, 9943.07] + - - [256, 8976, 1, 15104] + - [611, 8996.63] + - - [1024, 2048, 1, 256] + - [609, 8462.66] + - - [256, 8976, 1, 8960] + - [611, 8998.92] + - - [1024, 6144, 1, 256] + - [616, 9359.67] + - - [1024, 14592, 1, 256] + - [614, 9667.42] + - - [256, 8976, 1, 19712] + - [610, 9020.11] + - - [1024, 11520, 1, 256] + - [609, 9527.7] + - - [1024, 5632, 1, 256] + - [608, 9297.2] + - - [256, 8976, 1, 11008] + - [617, 8994.8] + - - [256, 8976, 1, 17152] + - [611, 9003.8] + - - [256, 8976, 1, 3072] + - [600, 8261.96] + - - [1024, 3840, 1, 256] + - [617, 8671.89] + - - [1024, 14336, 1, 256] + - [614, 9760.28] + - - [1024, 20480, 1, 256] + - [608, 9887.85] + - - [1024, 23552, 1, 256] + - [608, 9890.46] + - - [256, 8976, 1, 7168] + - [603, 8478.34] + - - [1024, 13568, 1, 256] + - [608, 9654.64] + - - [1024, 4608, 1, 256] + - [616, 9218.25] + - - [256, 8976, 1, 10240] + - [601, 8076.16] + - - [1024, 8704, 1, 256] + - [610, 9475.5] + - - [1024, 11008, 1, 256] + - [614, 9524.96] + - - [1024, 8448, 1, 256] + - [608, 9352.16] + - - [256, 8976, 1, 44505] + - [612, 8430.23] + - - [6272, 256, 1, 528] + - [664, 7389.94] + - - [3136, 2048, 1, 1024] + - [645, 9657.94] + - - [6272, 112, 1, 512] + - [643, 5931.09] + - - [2048, 320, 1, 1280] + - [663, 7772.99] + - - [289, 256, 1, 1568] + - [684, 3718.17] + - - [3136, 64, 64, 64] + - [623, 8201.15] + - - [50176, 128, 1, 256] + - [646, 8908.58] + - - [5329, 64, 1, 448] + - [629, 4602.2] + - - [289, 192, 1, 1344] + - [681, 3452.59] + - - [12544, 1024, 1, 256] + - [646, 9742.64] + - - [784, 64, 32, 192] + - [622, 6844.61] + - - [6272, 64, 1, 480] + - [630, 5562.24] + - - [196, 128, 1, 800] + - [672, 1639.74] + - - [64, 512, 1, 1344] + - [671, 2313.04] + - - [6272, 64, 1, 512] + - [629, 5609.19] + - - [6272, 160, 1, 528] + - [630, 6149.7] + - - [289, 160, 32, 768] + - [657, 6637.82] + - - [12544, 256, 1, 1024] + - [664, 8790.46] + - - [289, 224, 1, 1568] + - [684, 3270.17] + - - [5329, 64, 32, 160] + - [637, 9091.04] + - - [5329, 96, 1, 576] + - [664, 5555.66] + - - [3025, 64, 1, 363] + - [682, 4392.3] + - - [784, 32, 32, 192] + - [653, 5633.8] + - - [3136, 512, 1, 1024] + - [649, 7553.14] + - - [6272, 16, 1, 480] + - [684, 3219.85] + - - [1225, 64, 32, 288] + - [644, 8240.58] + - - [64, 256, 1, 1536] + - [677, 1456.36] + - - [289, 192, 32, 768] + - [656, 7372.8] + - - [2048, 448, 1, 1280] + - [639, 8403.01] + - - [3136, 2048, 1, 512] + - [638, 9486.31] + - - [289, 256, 1, 2016] + - [684, 3876.08] + - - [289, 384, 32, 1024] + - [623, 7350.54] + - - [1568, 32, 1, 832] + - [673, 2717.87] + - - [3136, 64, 32, 64] + - [626, 7657.26] + - - [289, 160, 1, 1120] + - [680, 2826.9] + - - [6272, 128, 1, 528] + - [634, 6926.26] + - - [21609, 32, 1, 288] + - [635, 3698.9] + - - [1225, 192, 1, 1728] + - [668, 7309.81] + - - [4096, 512, 1, 4096] + - [651, 10272.1] + - - [64, 256, 1, 1152] + - [677, 1387.82] + - - [6272, 96, 1, 480] + - [665, 6371.56] + - - [784, 96, 1, 800] + - [685, 3330.27] + - - [2048, 448, 1, 2048] + - [639, 8622.65] + - - [784, 96, 32, 192] + - [654, 7092.36] + - - [3136, 64, 64, 256] + - [647, 9579.16] + - - [289, 224, 1, 1344] + - [684, 3180.01] + - - [1001, 512, 1, 4096] + - [625, 8195.07] + - - [2048, 192, 1, 1280] + - [630, 6120.09] + - - [1225, 64, 32, 256] + - [635, 8076.62] + - - [2048, 256, 1, 1536] + - [625, 8137.7] + - - [1225, 64, 1, 1200] + - [684, 3552.87] + - - [6272, 128, 1, 512] + - [638, 6878.21] + - - [729, 192, 1, 1600] + - [683, 5016.77] + - - [289, 192, 1, 896] + - [681, 3091.87] + - - [1568, 384, 1, 832] + - [664, 6934.62] + - - [784, 16, 32, 192] + - [655, 3380.28] + - - [1568, 256, 1, 832] + - [629, 5980.86] + - - [1568, 48, 1, 832] + - [686, 3275.09] + - - [1568, 192, 1, 832] + - [624, 4441.11] + - - [289, 192, 32, 1024] + - [627, 6563.06] + - - [6272, 32, 1, 528] + - [668, 4998.67] + - - [49, 128, 1, 1200] + - [669, 550.175] + - - [1225, 64, 32, 384] + - [641, 8589.33] + - - [289, 128, 1, 896] + - [680, 2103.1] + - - [1568, 160, 1, 832] + - [668, 6995.05] + - - [1001, 32, 1, 1024] + - [677, 1744.72] + - - [2048, 320, 1, 2048] + - [662, 7118.04] + - - [2048, 384, 1, 1536] + - [625, 8184.01] + - - [50176, 512, 1, 256] + - [637, 9852.4] + - - [289, 256, 1, 1792] + - [686, 3809.75] + - - [64, 448, 1, 1152] + - [678, 2128.23] + - - [5041, 96, 1, 576] + - [663, 5279.3] + - - [6272, 192, 1, 480] + - [625, 7479.65] + - - [784, 32, 32, 256] + - [652, 5708.91] + - - [1001, 32, 1, 2048] + - [679, 2141.04] + - - [289, 192, 1, 1120] + - [675, 3277.77] + - - [6272, 32, 1, 512] + - [667, 4978.7] + - - [289, 384, 1, 3456] + - [684, 5904.14] + - - [289, 384, 1, 2592] + - [685, 5707.34] + - - [784, 128, 64, 512] + - [631, 8864.39] + - - [12544, 1024, 1, 512] + - [646, 10008.3] + - - [12544, 256, 1, 512] + - [664, 8628.08] + - - [6272, 24, 1, 512] + - [668, 3568.07] + - - [5041, 192, 1, 720] + - [639, 8424.42] + - - [64, 320, 1, 1728] + - [672, 1469.66] + - - [784, 128, 32, 256] + - [640, 8104.14] + - - [289, 96, 1, 864] + - [678, 1838.25] + - - [1225, 32, 32, 192] + - [659, 5949.72] + - - [1568, 128, 1, 832] + - [667, 5718.69] + - - [289, 128, 32, 768] + - [625, 7289.25] + - - [3136, 256, 64, 64] + - [633, 9103.92] + - - [196, 64, 1, 800] + - [671, 915.62] + - - [4096, 512, 1, 9216] + - [648, 10351.4] + - - [12544, 64, 1, 147] + - [638, 5069.33] + - - [784, 32, 1, 400] + - [669, 1140.36] + - - [6272, 160, 1, 512] + - [629, 6140.08] + - - [1225, 48, 32, 288] + - [635, 5978.61] + - - [64, 320, 1, 2880] + - [676, 1920.0] + - - [1225, 64, 32, 192] + - [629, 7641.01] + - - [1001, 32, 1, 1536] + - [677, 2084.79] + - - [784, 64, 32, 256] + - [621, 6990.51] + - - [64, 384, 1, 1152] + - [678, 1862.6] + - - [784, 512, 64, 128] + - [632, 9025.95] + - - [3136, 512, 1, 2048] + - [650, 7764.3] + - - [6272, 144, 1, 512] + - [625, 5574.04] + - - [1225, 192, 32, 384] + - [639, 9373.83] + - - [64, 192, 1, 1728] + - [677, 1206.46] + - - [8192, 320, 1, 1280] + - [691, 9875.92] + - - [8192, 320, 1, 2048] + - [694, 9745.7] + - - [8192, 384, 1, 1280] + - [691, 10046.2] + - - [8192, 192, 1, 1280] + - [694, 9950.9] + - - [8192, 192, 1, 2048] + - [690, 9559.67] + - - [8192, 384, 1, 2048] + - [692, 9945.74] + - - [8192, 448, 1, 2048] + - [693, 9908.51] + - - [1001, 64, 1, 1536] + - [687, 3649.94] + - - [8192, 448, 1, 1280] + - [691, 9981.35] + - - [1001, 64, 1, 2048] + - [688, 3580.87] + - - [1001, 128, 1, 2048] + - [689, 5587.87] - - [704, 1024, 1, 128] - - [661, 3019.56] + - [797, 3019.56] - - [1024, 1024, 1, 3328] - - [699, 8162.65] + - [835, 8162.65] - - [4, 704, 1, 1280] - - [602, 319.646] + - [738, 319.646] - - [4, 1856, 1, 3328] - - [632, 550.614] + - [768, 550.614] - - [1856, 448, 1, 3328] - - [684, 6813.15] + - [820, 6813.15] - - [2944, 4288, 1, 1280] - - [693, 8975.86] + - [829, 8975.86] - - [2368, 64, 1, 3328] - - [607, 5482.33] + - [743, 5482.33] - - [1760, 32, 1, 1760] - - [646, 3860.04] + - [782, 3860.04] - - [2368, 5888, 1, 256] - - [690, 8656.83] + - [826, 8656.83] - - [5888, 1856, 1, 256] - - [680, 7881.53] + - [816, 7881.53] - - [64, 3584, 1, 1280] - - [616, 4835.43] + - [752, 4835.43] - - [512, 24000, 1, 1536] - - [687, 8666.0] + - [823, 8666.0] - - [128, 6784, 1, 3328] - - [684, 7062.35] + - [820, 7062.35] - - [5888, 1408, 1, 256] - - [697, 8130.32] + - [833, 8130.32] - - [5888, 1856, 1, 3328] - - [687, 8840.85] + - [823, 8840.85] - - [512, 4, 1, 512] - - [572, 170.323] + - [708, 170.323] - - [35, 1500, 1, 2560] - - [576, 2896.65] + - [712, 2896.65] - - [1856, 4288, 1, 256] - - [676, 8374.73] + - [812, 8374.73] - - [1024, 5056, 1, 128] - - [673, 3304.35] + - [809, 3304.35] - - [5056, 5056, 1, 3328] - - [687, 8905.53] + - [823, 8905.53] - - [1408, 5888, 1, 1280] - - [687, 9418.2] + - [823, 9418.2] - - [2368, 448, 1, 128] - - [661, 3075.07] + - [797, 3075.07] - - [6144, 6000, 1, 2560] - - [687, 9336.43] + - [823, 9336.43] - - [2368, 6784, 1, 128] - - [660, 4919.36] + - [796, 4919.36] - - [1024, 3584, 1, 3328] - - [678, 8071.17] + - [814, 8071.17] - - [512, 48000, 1, 2048] - - [687, 8763.16] + - [823, 8763.16] - - [1408, 64, 1, 128] - - [583, 805.57] + - [719, 805.57] - - [256, 4288, 1, 3328] - - [709, 6331.96] + - [845, 6331.96] - - [5888, 1408, 1, 1280] - - [677, 9226.27] + - [813, 9226.27] - - [704, 1856, 1, 3328] - - [703, 6309.5] + - [839, 6309.5] - - [1408, 4288, 1, 256] - - [687, 8374.6] + - [823, 8374.6] - - [1024, 2368, 1, 256] - - [684, 7341.12] + - [820, 7341.12] - - [64, 4, 1, 256] - - [627, 13.1032] + - [763, 13.1032] - - [1408, 1856, 1, 1280] - - [694, 8773.05] + - [830, 8773.05] - - [1408, 64, 1, 1280] - - [640, 4050.08] + - [776, 4050.08] - - [448, 1024, 1, 1280] - - [703, 6071.26] + - [839, 6071.26] - - [4096, 32, 1, 4096] - - [637, 5491.82] + - [773, 5491.82] - - [256, 1408, 1, 3328] - - [689, 5351.49] + - [825, 5351.49] - - [5056, 5056, 1, 1280] - - [697, 9408.67] + - [833, 9408.67] - - [448, 5056, 1, 256] - - [702, 6680.54] + - [838, 6680.54] - - [704, 1856, 1, 1280] - - [679, 7504.03] + - [815, 7504.03] - - [128, 5056, 1, 128] - - [594, 2316.58] + - [730, 2316.58] - - [2368, 128, 1, 256] - - [679, 3660.22] + - [815, 3660.22] - - [1856, 1408, 1, 128] - - [666, 3885.97] + - [802, 3885.97] - - [64, 5056, 1, 256] - - [689, 3318.91] + - [825, 3318.91] - - [6784, 256, 1, 3328] - - [687, 7590.64] + - [823, 7590.64] - - [1408, 3584, 1, 256] - - [676, 8276.92] + - [812, 8276.92] - - [4288, 448, 1, 256] - - [689, 7139.79] + - [825, 7139.79] - - [64, 704, 1, 128] - - [590, 375.567] + - [726, 375.567] - - [1024, 1856, 1, 128] - - [659, 2890.66] + - [795, 2890.66] - - [4288, 2944, 1, 1280] - - [693, 8981.45] + - [829, 8981.45] - - [704, 5056, 1, 1280] - - [679, 7684.72] + - [815, 7684.72] - - [2368, 704, 1, 3328] - - [694, 7070.14] + - [830, 7070.14] - - [256, 5888, 1, 256] - - [679, 7319.45] + - [815, 7319.45] - - [1856, 4288, 1, 3328] - - [677, 9238.69] + - [813, 9238.69] - - [256, 2944, 1, 256] - - [679, 6090.31] + - [815, 6090.31] - - [5888, 1024, 1, 256] - - [683, 8270.05] + - [819, 8270.05] - - [448, 64, 1, 1280] - - [636, 2493.32] + - [772, 2493.32] - - [3072, 64, 1, 1024] - - [619, 3149.77] + - [755, 3149.77] - - [3584, 4, 1, 1280] - - [721, 567.862] + - [857, 567.862] - - [2560, 16, 1, 2560] - - [628, 2887.15] + - [764, 2887.15] - - [2944, 64, 1, 256] - - [619, 2565.76] + - [755, 2565.76] - - [128, 4, 1, 1280] - - [722, 78.8692] + - [858, 78.8692] - - [1408, 2944, 1, 256] - - [683, 8337.3] + - [819, 8337.3] - - [256, 1856, 1, 1280] - - [709, 6267.35] + - [845, 6267.35] - - [6784, 5056, 1, 3328] - - [693, 9424.0] + - [829, 9424.0] - - [5056, 5056, 1, 256] - - [680, 8758.33] + - [816, 8758.33] - - [128, 256, 1, 256] - - [635, 1205.36] + - [771, 1205.36] - - [64, 1024, 1, 1280] - - [646, 3566.68] + - [782, 3566.68] - - [2944, 4, 1, 256] - - [599, 319.449] + - [735, 319.449] - - [704, 5056, 1, 128] - - [668, 4073.83] + - [804, 4073.83] - - [4, 2368, 1, 1280] - - [627, 496.992] + - [763, 496.992] - - [2368, 2944, 1, 1280] - - [676, 9085.55] + - [812, 9085.55] - - [448, 448, 1, 3328] - - [654, 5428.76] + - [790, 5428.76] - - [6784, 6784, 1, 1280] - - [693, 8727.03] + - [829, 8727.03] - - [1024, 256, 1, 3328] - - [703, 5499.42] + - [839, 5499.42] - - [1408, 4288, 1, 1280] - - [677, 9094.42] + - [813, 9094.42] - - [3584, 4288, 1, 1280] - - [680, 8703.88] + - [816, 8703.88] - - [512, 6000, 1, 2560] - - [683, 8474.56] + - [819, 8474.56] - - [2368, 704, 1, 1280] - - [689, 7651.59] + - [825, 7651.59] - - [5056, 4288, 1, 3328] - - [697, 8545.35] + - [833, 8545.35] - - [3584, 2368, 1, 3328] - - [685, 8797.88] + - [821, 8797.88] - - [5888, 6784, 1, 1280] - - [683, 8785.18] + - [819, 8785.18] - - [64, 704, 1, 1280] - - [606, 2783.48] + - [742, 2783.48] - - [4288, 256, 1, 256] - - [679, 6162.78] + - [815, 6162.78] - - [2944, 128, 1, 128] - - [581, 1951.33] + - [717, 1951.33] - - [6144, 32, 1, 2560] - - [640, 4589.05] + - [776, 4589.05] - - [6784, 448, 1, 1280] - - [684, 8674.31] + - [820, 8674.31] - - [2944, 5888, 1, 256] - - [697, 8991.76] + - [833, 8991.76] - - [64, 64, 1, 1280] - - [657, 712.448] + - [793, 712.448] - - [4288, 2944, 1, 256] - - [693, 8678.14] + - [829, 8678.14] - - [5888, 704, 1, 1280] - - [683, 8652.71] + - [819, 8652.71] - - [5056, 4, 1, 3328] - - [599, 650.772] + - [735, 650.772] - - [1856, 64, 1, 1280] - - [616, 4471.97] + - [752, 4471.97] - - [1760, 16, 1, 1760] - - [656, 2592.23] + - [792, 2592.23] - - [448, 5888, 1, 128] - - [666, 3823.03] + - [802, 3823.03] - - [5888, 64, 1, 3328] - - [648, 6013.22] + - [784, 6013.22] - - [2944, 256, 1, 3328] - - [689, 7791.45] + - [825, 7791.45] - - [1024, 64, 1, 128] - - [590, 592.516] + - [726, 592.516] - - [5056, 2368, 1, 1280] - - [676, 9260.53] + - [812, 9260.53] - - [448, 3584, 1, 1280] - - [697, 6771.34] + - [833, 6771.34] - - [6784, 5888, 1, 256] - - [691, 7933.39] + - [827, 7933.39] - - [64, 1024, 1, 3328] - - [640, 4783.08] + - [776, 4783.08] - - [704, 128, 1, 1280] - - [646, 3971.98] + - [782, 3971.98] - - [4, 3584, 1, 128] - - [715, 59.5238] + - [851, 59.5238] - - [1408, 448, 1, 1280] - - [689, 5902.17] + - [825, 5902.17] - - [1024, 1408, 1, 256] - - [684, 5272.94] + - [820, 5272.94] - - [2368, 2368, 1, 3328] - - [689, 8488.76] + - [825, 8488.76] - - [1856, 6784, 1, 128] - - [666, 4742.51] + - [802, 4742.51] - - [5056, 704, 1, 3328] - - [692, 7772.48] + - [828, 7772.48] - - [1408, 1856, 1, 256] - - [710, 5229.84] + - [846, 5229.84] - - [1408, 704, 1, 3328] - - [710, 6954.93] + - [846, 6954.93] - - [2368, 5056, 1, 256] - - [683, 8580.68] + - [819, 8580.68] - - [1408, 256, 1, 1280] - - [709, 4790.11] + - [845, 4790.11] - - [3072, 128, 1, 1024] - - [705, 4579.87] + - [841, 4579.87] - - [3584, 2368, 1, 1280] - - [676, 8675.13] + - [812, 8675.13] - - [4288, 64, 1, 3328] - - [655, 5550.11] + - [791, 5550.11] - - [2368, 4, 1, 1280] - - [721, 537.518] + - [857, 537.518] - - [704, 5888, 1, 256] - - [677, 5305.88] + - [813, 5305.88] - - [6784, 2944, 1, 128] - - [673, 4344.21] + - [809, 4344.21] - - [6784, 64, 1, 256] - - [703, 4496.42] + - [839, 4496.42] - - [2944, 256, 1, 256] - - [689, 6553.7] + - [825, 6553.7] - - [2944, 6784, 1, 3328] - - [677, 8895.76] + - [813, 8895.76] - - [128, 1, 1, 1408] - - [657, 25.7] + - [793, 25.7] - - [704, 1408, 1, 3328] - - [691, 7913.21] + - [827, 7913.21] - - [3584, 704, 1, 3328] - - [676, 7526.43] + - [812, 7526.43] - - [2944, 256, 1, 128] - - [660, 2830.76] + - [796, 2830.76] - - [6784, 4, 1, 1280] - - [717, 645.235] + - [853, 645.235] - - [1024, 64, 1, 1280] - - [615, 3013.25] + - [751, 3013.25] - - [8448, 4, 1, 2816] - - [567, 984.768] + - [703, 984.768] - - [448, 4288, 1, 256] - - [689, 7139.79] + - [825, 7139.79] - - [64, 3584, 1, 3328] - - [613, 5683.27] + - [749, 5683.27] - - [704, 2368, 1, 1280] - - [697, 7045.3] + - [833, 7045.3] - - [1856, 2368, 1, 1280] - - [694, 8327.9] + - [830, 8327.9] - - [2368, 128, 1, 3328] - - [630, 6082.65] + - [766, 6082.65] - - [64, 193600, 1, 64] - - [679, 6747.77] + - [815, 6747.77] - - [1760, 128, 1, 1760] - - [607, 5513.07] + - [743, 5513.07] - - [448, 1408, 1, 256] - - [689, 5591.54] + - [825, 5591.54] - - [1856, 4288, 1, 1280] - - [687, 8647.72] + - [823, 8647.72] - - [64, 5056, 1, 3328] - - [647, 6096.59] + - [783, 6096.59] - - [512, 1500, 1, 2816] - - [689, 7879.3] + - [825, 7879.3] - - [1024, 448, 1, 128] - - [661, 1844.33] + - [797, 1844.33] - - [704, 4, 1, 1280] - - [627, 341.433] + - [763, 341.433] - - [704, 256, 1, 128] - - [661, 1001.34] + - [797, 1001.34] - - [256, 193600, 1, 64] - - [697, 8113.3] + - [833, 8113.3] - - [704, 2944, 1, 128] - - [668, 3747.13] + - [804, 3747.13] - - [1408, 1024, 1, 1280] - - [694, 7080.71] + - [830, 7080.71] - - [704, 6784, 1, 256] - - [712, 6630.47] + - [848, 6630.47] - - [6784, 704, 1, 256] - - [679, 8005.86] + - [815, 8005.86] - - [5056, 1408, 1, 128] - - [670, 4303.13] + - [806, 4303.13] - - [2048, 7000, 1, 2048] - - [687, 9269.2] + - [823, 9269.2] - - [256, 3584, 1, 3328] - - [681, 7334.48] + - [817, 7334.48] - - [5056, 704, 1, 256] - - [689, 7954.12] + - [825, 7954.12] - - [128, 1408, 1, 128] - - [584, 1243.02] + - [720, 1243.02] - - [3584, 4288, 1, 3328] - - [713, 7683.81] + - [849, 7683.81] - - [5888, 1856, 1, 1280] - - [677, 8831.34] + - [813, 8831.34] - - [256, 1408, 1, 256] - - [679, 4352.68] + - [815, 4352.68] - - [5056, 64, 1, 1280] - - [646, 5012.05] + - [782, 5012.05] - - [1024, 704, 1, 256] - - [679, 5710.17] + - [815, 5710.17] - - [64, 256, 1, 128] - - [585, 149.897] + - [721, 149.897] - - [2368, 3584, 1, 1280] - - [687, 8609.68] + - [823, 8609.68] - - [1024, 256, 1, 256] - - [703, 3276.9] + - [839, 3276.9] - - [1856, 4, 1, 1280] - - [601, 497.104] + - [737, 497.104] - - [448, 448, 1, 256] - - [689, 3117.83] + - [825, 3117.83] - - [2944, 3584, 1, 3328] - - [677, 8879.45] + - [813, 8879.45] - - [7680, 32, 1, 2560] - - [647, 5310.24] + - [783, 5310.24] - - [128, 4288, 1, 128] - - [587, 2116.2] + - [723, 2116.2] - - [256, 256, 1, 3328] - - [640, 4774.7] + - [776, 4774.7] - - [128, 1024, 1, 3328] - - [641, 5894.8] + - [777, 5894.8] - - [4, 1408, 1, 3328] - - [632, 552.674] + - [768, 552.674] - - [196, 256, 64, 1024] - - [730, 5218.34] + - [866, 5218.34] - - [6784, 2944, 1, 256] - - [695, 8271.18] + - [831, 8271.18] - - [64, 1856, 1, 1280] - - [646, 4167.96] + - [782, 4167.96] - - [64, 1024, 1, 128] - - [580, 589.188] + - [716, 589.188] - - [1024, 1500, 1, 2560] - - [684, 8407.88] + - [820, 8407.88] - - [1856, 2368, 1, 256] - - [679, 8092.15] + - [815, 8092.15] - - [3584, 256, 1, 128] - - [662, 2607.57] + - [798, 2607.57] - - [3584, 6784, 1, 3328] - - [696, 8558.83] + - [832, 8558.83] - - [256, 1024, 1, 256] - - [689, 3901.78] + - [825, 3901.78] - - [4, 6784, 1, 3328] - - [627, 662.575] + - [763, 662.575] - - [1024, 5888, 1, 3328] - - [687, 9161.76] + - [823, 9161.76] - - [1024, 128, 1, 1280] - - [644, 3942.12] + - [780, 3942.12] - - [3072, 32, 1, 1024] - - [621, 2840.49] + - [757, 2840.49] - - [6144, 24000, 1, 2560] - - [677, 7605.87] + - [813, 7605.87] - - [448, 1024, 1, 256] - - [679, 5062.19] + - [815, 5062.19] - - [5056, 4288, 1, 1280] - - [687, 9090.99] + - [823, 9090.99] - - [5888, 64, 1, 256] - - [689, 4449.78] + - [825, 4449.78] - - [1856, 256, 1, 1280] - - [703, 5834.46] + - [839, 5834.46] - - [64, 5888, 1, 3328] - - [641, 6152.44] + - [777, 6152.44] - - [2368, 2368, 1, 1280] - - [681, 8594.66] + - [817, 8594.66] - - [2944, 5888, 1, 128] - - [666, 4776.19] + - [802, 4776.19] - - [704, 5888, 1, 1280] - - [681, 8435.91] + - [817, 8435.91] - - [2368, 3584, 1, 128] - - [663, 4590.71] + - [799, 4590.71] - - [1856, 5056, 1, 128] - - [674, 4503.48] + - [810, 4503.48] - - [4608, 1, 1, 1536] - - [572, 226.955] + - [708, 226.955] - - [448, 256, 1, 3328] - - [616, 5415.56] + - [752, 5415.56] - - [2944, 6784, 1, 1280] - - [700, 8385.11] + - [836, 8385.11] - - [448, 1856, 1, 128] - - [670, 2618.96] + - [806, 2618.96] - - [128, 1024, 1, 128] - - [579, 940.527] + - [715, 940.527] - - [7680, 4, 1, 2560] - - [603, 985.104] + - [739, 985.104] - - [1024, 704, 1, 1280] - - [689, 7204.56] + - [825, 7204.56] - - [128, 5888, 1, 256] - - [679, 6313.52] + - [815, 6313.52] - - [1024, 5056, 1, 1280] - - [684, 8979.76] + - [820, 8979.76] - - [4288, 1024, 1, 256] - - [676, 7198.29] + - [812, 7198.29] - - [2944, 2368, 1, 128] - - [661, 4624.57] + - [797, 4624.57] - - [704, 704, 1, 3328] - - [702, 5870.71] + - [838, 5870.71] - - [704, 1408, 1, 1280] - - [691, 7680.32] + - [827, 7680.32] - - [5888, 448, 1, 1280] - - [679, 7718.66] + - [815, 7718.66] - - [3584, 256, 1, 3328] - - [684, 7523.88] + - [820, 7523.88] - - [704, 5888, 1, 3328] - - [689, 8196.99] + - [825, 8196.99] - - [704, 1856, 1, 128] - - [667, 3388.43] + - [803, 3388.43] - - [128, 3584, 1, 3328] - - [641, 6626.5] + - [777, 6626.5] - - [4, 4288, 1, 128] - - [714, 159.648] + - [850, 159.648] - - [128, 704, 1, 1280] - - [604, 4038.73] + - [740, 4038.73] - - [3584, 2944, 1, 256] - - [677, 7685.99] + - [813, 7685.99] - - [1856, 128, 1, 3328] - - [633, 6070.63] + - [769, 6070.63] - - [1856, 2368, 1, 3328] - - [694, 8460.62] + - [830, 8460.62] - - [512, 6000, 1, 2816] - - [697, 9019.55] + - [833, 9019.55] - - [2944, 448, 1, 128] - - [660, 3027.73] + - [796, 3027.73] - - [64, 193600, 1, 256] - - [703, 7080.32] + - [839, 7080.32] - - [128, 2944, 1, 1280] - - [679, 5397.87] + - [815, 5397.87] - - [448, 2944, 1, 1280] - - [689, 6996.97] + - [825, 6996.97] - - [512, 24000, 1, 2048] - - [697, 8832.67] + - [833, 8832.67] - - [128, 256, 1, 3328] - - [636, 3531.57] + - [772, 3531.57] - - [1408, 5056, 1, 3328] - - [692, 7969.94] + - [828, 7969.94] - - [1856, 1856, 1, 3328] - - [679, 8140.34] + - [815, 8140.34] - - [3584, 128, 1, 256] - - [689, 4861.05] + - [825, 4861.05] - - [448, 1408, 1, 3328] - - [679, 6353.75] + - [815, 6353.75] - - [2368, 2368, 1, 256] - - [693, 8369.37] + - [829, 8369.37] - - [4288, 4288, 1, 1280] - - [683, 8666.52] + - [819, 8666.52] - - [64, 448, 1, 1280] - - [636, 2591.92] + - [772, 2591.92] - - [5888, 1024, 1, 1280] - - [676, 8526.6] + - [812, 8526.6] - - [704, 1024, 1, 256] - - [689, 4971.8] + - [825, 4971.8] - - [1024, 12544, 1, 256] - - [727, 8611.9] + - [863, 8611.9] - - [448, 4, 1, 256] - - [632, 78.6534] + - [768, 78.6534] - - [5888, 448, 1, 128] - - [663, 3592.03] + - [799, 3592.03] - - [512, 48000, 1, 2560] - - [697, 9237.44] + - [833, 9237.44] - - [8448, 16, 1, 2816] - - [562, 3360.21] + - [698, 3360.21] - - [704, 6784, 1, 3328] - - [698, 7774.95] + - [834, 7774.95] - - [5888, 5888, 1, 1280] - - [684, 9238.25] + - [820, 9238.25] - - [5056, 1024, 1, 1280] - - [712, 8227.88] + - [848, 8227.88] - - [448, 5888, 1, 3328] - - [687, 7777.63] + - [823, 7777.63] - - [3072, 2, 1, 1024] - - [624, 376.383] + - [760, 376.383] - - [1024, 2944, 1, 1280] - - [677, 8650.45] + - [813, 8650.45] - - [5056, 5888, 1, 1280] - - [687, 8861.6] + - [823, 8861.6] - - [4288, 5888, 1, 128] - - [667, 5049.01] + - [803, 5049.01] - - [256, 3584, 1, 256] - - [679, 6314.11] + - [815, 6314.11] - - [256, 4, 1, 1280] - - [723, 163.94] + - [859, 163.94] - - [1408, 3584, 1, 128] - - [667, 4290.22] + - [803, 4290.22] - - [256, 2944, 1, 3328] - - [689, 7620.99] + - [825, 7620.99] - - [448, 3584, 1, 128] - - [667, 3353.9] + - [803, 3353.9] - - [5888, 2944, 1, 1280] - - [677, 9498.31] + - [813, 9498.31] - - [4, 6784, 1, 1280] - - [627, 623.916] + - [763, 623.916] - - [2368, 5888, 1, 128] - - [666, 4840.29] + - [802, 4840.29] - - [35, 8457, 1, 1760] - - [573, 4059.88] + - [709, 4059.88] - - [64, 2944, 1, 128] - - [584, 1310.82] + - [720, 1310.82] - - [2368, 4, 1, 256] - - [718, 369.739] + - [854, 369.739] - - [3584, 5888, 1, 256] - - [695, 7996.33] + - [831, 7996.33] - - [2368, 1024, 1, 128] - - [661, 3915.07] + - [797, 3915.07] - - [2368, 704, 1, 128] - - [661, 3658.97] + - [797, 3658.97] - - [512, 32, 1, 512] - - [650, 1127.6] + - [786, 1127.6] - - [3584, 2368, 1, 128] - - [661, 4462.48] + - [797, 4462.48] - - [5056, 704, 1, 128] - - [660, 4062.21] + - [796, 4062.21] - - [448, 2368, 1, 128] - - [661, 2829.07] + - [797, 2829.07] - - [4, 5056, 1, 256] - - [609, 425.868] + - [745, 425.868] - - [5056, 1408, 1, 3328] - - [694, 8848.92] + - [830, 8848.92] - - [1408, 704, 1, 256] - - [689, 5394.56] + - [825, 5394.56] - - [6784, 1024, 1, 3328] - - [676, 9232.02] + - [812, 9232.02] - - [6784, 2944, 1, 3328] - - [687, 8714.84] + - [823, 8714.84] - - [7680, 1, 1, 2560] - - [623, 248.845] + - [759, 248.845] - - [1856, 1856, 1, 256] - - [688, 7586.58] + - [824, 7586.58] - - [64, 64, 1, 3328] - - [658, 1363.25] + - [794, 1363.25] - - [512, 1, 1, 512] - - [572, 43.2158] + - [708, 43.2158] - - [6784, 2368, 1, 1280] - - [689, 8665.74] + - [825, 8665.74] - - [4608, 2, 1, 1536] - - [572, 452.65] + - [708, 452.65] - - [4288, 3584, 1, 256] - - [697, 8936.7] + - [833, 8936.7] - - [4288, 5888, 1, 1280] - - [694, 8957.15] + - [830, 8957.15] - - [4608, 4, 1, 1536] - - [565, 846.737] + - [701, 846.737] - - [1024, 6000, 1, 1536] - - [687, 8398.54] + - [823, 8398.54] - - [8448, 32, 1, 2816] - - [647, 5343.07] + - [783, 5343.07] - - [448, 2944, 1, 3328] - - [694, 7247.04] + - [830, 7247.04] - - [4288, 1856, 1, 1280] - - [677, 8902.86] + - [813, 8902.86] - - [1856, 2944, 1, 3328] - - [689, 8622.86] + - [825, 8622.86] - - [256, 6784, 1, 3328] - - [689, 8050.77] + - [825, 8050.77] - - [512, 3000, 1, 1536] - - [710, 7108.12] + - [846, 7108.12] - - [64, 5888, 1, 256] - - [702, 3567.74] + - [838, 3567.74] - - [256, 5056, 1, 128] - - [669, 3041.12] + - [805, 3041.12] - - [5056, 1024, 1, 256] - - [693, 8401.47] + - [829, 8401.47] - - [704, 64, 1, 3328] - - [652, 4299.02] + - [788, 4299.02] - - [5056, 1856, 1, 3328] - - [697, 8660.77] + - [833, 8660.77] - - [4, 2944, 1, 3328] - - [627, 618.637] + - [763, 618.637] - - [512, 1500, 1, 2048] - - [709, 5481.22] + - [845, 5481.22] - - [1024, 1, 1, 500000] - - [563, 260.061] + - [699, 260.061] - - [256, 4, 1, 256] - - [627, 50.5123] + - [763, 50.5123] - - [6784, 128, 1, 3328] - - [681, 6950.91] + - [817, 6950.91] - - [4288, 1408, 1, 128] - - [661, 4539.58] + - [797, 4539.58] - - [1856, 5888, 1, 3328] - - [687, 8712.93] + - [823, 8712.93] - - [4288, 5056, 1, 256] - - [693, 8997.15] + - [829, 8997.15] - - [1408, 128, 1, 1280] - - [616, 4599.12] + - [752, 4599.12] - - [4096, 7000, 1, 4096] - - [683, 8555.89] + - [819, 8555.89] - - [5056, 256, 1, 3328] - - [689, 8257.16] + - [825, 8257.16] - - [704, 704, 1, 256] - - [679, 5852.39] + - [815, 5852.39] - - [1024, 3000, 1, 2560] - - [676, 8258.84] + - [812, 8258.84] - - [1024, 5888, 1, 1280] - - [676, 8988.99] + - [812, 8988.99] - - [6784, 2368, 1, 128] - - [662, 4562.25] + - [798, 4562.25] - - [4, 5056, 1, 1280] - - [627, 600.441] + - [763, 600.441] - - [256, 64, 1, 1280] - - [650, 1899.69] + - [786, 1899.69] - - [128, 1856, 1, 1280] - - [689, 5185.76] + - [825, 5185.76] - - [1856, 1024, 1, 1280] - - [694, 7875.95] + - [830, 7875.95] - - [6784, 4288, 1, 1280] - - [697, 8981.18] + - [833, 8981.18] - - [1856, 1856, 1, 1280] - - [678, 7794.71] + - [814, 7794.71] - - [35, 1500, 1, 2048] - - [578, 2192.6] + - [714, 2192.6] - - [3072, 24000, 1, 1024] - - [690, 8690.58] + - [826, 8690.58] - - [1408, 5056, 1, 1280] - - [689, 8427.87] + - [825, 8427.87] - - [4, 2368, 1, 3328] - - [632, 594.422] + - [768, 594.422] - - [5888, 1856, 1, 128] - - [661, 4294.05] + - [797, 4294.05] - - [448, 704, 1, 1280] - - [684, 4136.39] + - [820, 4136.39] - - [448, 6784, 1, 128] - - [662, 3976.2] + - [798, 3976.2] - - [1024, 448, 1, 3328] - - [694, 6376.33] + - [830, 6376.33] - - [2944, 128, 1, 256] - - [679, 4466.26] + - [815, 4466.26] - - [5056, 3584, 1, 128] - - [667, 4997.18] + - [803, 4997.18] - - [5888, 5888, 1, 3328] - - [697, 8870.37] + - [833, 8870.37] - - [6784, 1024, 1, 256] - - [676, 8520.53] + - [812, 8520.53] - - [2944, 2368, 1, 256] - - [713, 6174.59] + - [849, 6174.59] - - [256, 448, 1, 256] - - [689, 1844.33] + - [825, 1844.33] - - [5056, 5888, 1, 3328] - - [678, 8076.65] + - [814, 8076.65] - - [1856, 1024, 1, 256] - - [689, 7188.92] + - [825, 7188.92] - - [512, 48000, 1, 1536] - - [700, 7282.2] + - [836, 7282.2] - - [3584, 448, 1, 1280] - - [679, 6869.1] + - [815, 6869.1] - - [1024, 1024, 1, 1280] - - [689, 8027.45] + - [825, 8027.45] - - [448, 5888, 1, 256] - - [679, 5765.84] + - [815, 5765.84] - - [2048, 128, 1, 2048] - - [637, 4835.01] + - [773, 4835.01] - - [1408, 6784, 1, 3328] - - [689, 8613.76] + - [825, 8613.76] - - [448, 1024, 1, 128] - - [660, 2315.57] + - [796, 2315.57] - - [4288, 704, 1, 128] - - [661, 4138.92] + - [797, 4138.92] - - [128, 1856, 1, 128] - - [596, 1397.56] + - [732, 1397.56] - - [448, 2368, 1, 3328] - - [679, 6786.48] + - [815, 6786.48] - - [5056, 64, 1, 128] - - [661, 1664.84] + - [797, 1664.84] - - [5056, 2944, 1, 256] - - [712, 7697.49] + - [848, 7697.49] - - [6784, 5888, 1, 128] - - [661, 5003.67] + - [797, 5003.67] - - [1024, 700, 1, 512] - - [689, 6036.31] + - [825, 6036.31] - - [3072, 1, 1, 128] - - [643, 70.3171] + - [779, 70.3171] - - [1024, 4, 1, 256] - - [601, 154.302] + - [737, 154.302] - - [2944, 704, 1, 128] - - [667, 3697.0] + - [803, 3697.0] - - [128, 6784, 1, 1280] - - [679, 6731.51] + - [815, 6731.51] - - [1408, 3584, 1, 3328] - - [677, 9258.07] + - [813, 9258.07] - - [2368, 6784, 1, 256] - - [676, 8840.4] + - [812, 8840.4] - - [5056, 1408, 1, 1280] - - [677, 9240.84] + - [813, 9240.84] - - [5056, 4288, 1, 128] - - [672, 4309.18] + - [808, 4309.18] - - [4, 704, 1, 256] - - [627, 130.697] + - [763, 130.697] - - [4288, 2368, 1, 3328] - - [690, 8755.33] + - [826, 8755.33] - - [1408, 1856, 1, 128] - - [660, 3918.75] + - [796, 3918.75] - - [1408, 5888, 1, 3328] - - [697, 8910.47] + - [833, 8910.47] - - [1856, 256, 1, 256] - - [679, 5631.34] + - [815, 5631.34] - - [6784, 6784, 1, 256] - - [687, 9298.76] + - [823, 9298.76] - - [5888, 5056, 1, 128] - - [662, 4811.36] + - [798, 4811.36] - - [4288, 2368, 1, 128] - - [661, 4749.1] + - [797, 4749.1] - - [128, 5888, 1, 1280] - - [688, 6393.86] + - [824, 6393.86] - - [256, 4288, 1, 1280] - - [679, 6887.79] + - [815, 6887.79] - - [2368, 2944, 1, 256] - - [693, 8314.82] + - [829, 8314.82] - - [4, 1856, 1, 256] - - [716, 267.03] + - [852, 267.03] - - [3584, 1856, 1, 1280] - - [677, 8631.91] + - [813, 8631.91] - - [6784, 6784, 1, 128] - - [667, 5059.96] + - [803, 5059.96] - - [256, 1856, 1, 128] - - [660, 1858.82] + - [796, 1858.82] - - [49, 512, 64, 2048] - - [731, 3053.67] + - [867, 3053.67] - - [704, 64, 1, 1280] - - [610, 2849.49] + - [746, 2849.49] - - [5888, 5056, 1, 256] - - [696, 8202.52] + - [832, 8202.52] - - [8448, 48000, 1, 2816] - - [687, 4281.94] + - [823, 4281.94] - - [512, 6000, 1, 2048] - - [679, 8047.89] + - [815, 8047.89] - - [3584, 448, 1, 256] - - [689, 6805.43] + - [825, 6805.43] - - [448, 4288, 1, 128] - - [667, 3500.83] + - [803, 3500.83] - - [7680, 64, 1, 2560] - - [622, 5957.9] + - [758, 5957.9] - - [256, 6784, 1, 256] - - [689, 7331.83] + - [825, 7331.83] - - [1408, 4288, 1, 128] - - [661, 4501.49] + - [797, 4501.49] - - [2944, 704, 1, 3328] - - [689, 8439.7] + - [825, 8439.7] - - [128, 448, 1, 256] - - [610, 1555.19] + - [746, 1555.19] - - [2048, 32, 1, 2048] - - [621, 3226.49] + - [757, 3226.49] - - [3584, 3584, 1, 256] - - [693, 8784.9] + - [829, 8784.9] - - [448, 1408, 1, 128] - - [660, 2535.92] + - [796, 2535.92] - - [128, 256, 1, 1280] - - [636, 2896.72] + - [772, 2896.72] - - [3584, 5056, 1, 256] - - [680, 8566.52] + - [816, 8566.52] - - [6784, 128, 1, 256] - - [679, 6053.97] + - [815, 6053.97] - - [4288, 4, 1, 256] - - [599, 428.9] + - [735, 428.9] - - [64, 1408, 1, 3328] - - [604, 5025.11] + - [740, 5025.11] - - [704, 448, 1, 256] - - [703, 3409.74] + - [839, 3409.74] - - [2944, 2368, 1, 1280] - - [677, 9066.35] + - [813, 9066.35] - - [448, 64, 1, 3328] - - [652, 3528.96] + - [788, 3528.96] - - [704, 6784, 1, 128] - - [666, 4212.61] + - [802, 4212.61] - - [3584, 4, 1, 3328] - - [719, 658.353] + - [855, 658.353] - - [6784, 3584, 1, 256] - - [687, 9061.84] + - [823, 9061.84] - - [704, 448, 1, 128] - - [666, 1552.8] + - [802, 1552.8] - - [256, 128, 1, 128] - - [591, 281.975] + - [727, 281.975] - - [704, 1408, 1, 128] - - [666, 3026.76] + - [802, 3026.76] - - [4, 448, 1, 128] - - [715, 5.56127] + - [851, 5.56127] - - [4288, 128, 1, 1280] - - [646, 5471.64] + - [782, 5471.64] - - [128, 1408, 1, 256] - - [689, 2813.35] + - [825, 2813.35] - - [4, 2944, 1, 256] - - [609, 316.766] + - [745, 316.766] - - [64, 128, 1, 3328] - - [657, 1872.56] + - [793, 1872.56] - - [1856, 1408, 1, 256] - - [679, 7735.89] + - [815, 7735.89] - - [5056, 2368, 1, 128] - - [661, 4830.19] + - [797, 4830.19] - - [2944, 2944, 1, 3328] - - [697, 8890.11] + - [833, 8890.11] - - [5056, 6784, 1, 256] - - [687, 9015.25] + - [823, 9015.25] - - [1856, 3584, 1, 128] - - [668, 4455.12] + - [804, 4455.12] - - [5888, 4, 1, 1280] - - [717, 642.063] + - [853, 642.063] - - [128, 2944, 1, 128] - - [586, 2037.03] + - [722, 2037.03] - - [35, 8457, 1, 2560] - - [574, 3988.23] + - [710, 3988.23] - - [3584, 6784, 1, 128] - - [661, 4774.54] + - [797, 4774.54] - - [128, 4288, 1, 256] - - [679, 4851.85] + - [815, 4851.85] - - [704, 448, 1, 3328] - - [694, 4432.63] + - [830, 4432.63] - - [2368, 6784, 1, 1280] - - [677, 9161.48] + - [813, 9161.48] - - [128, 128, 1, 3328] - - [651, 2839.99] + - [787, 2839.99] - - [5056, 1856, 1, 256] - - [693, 8380.94] + - [829, 8380.94] - - [256, 128, 1, 256] - - [635, 1165.18] + - [771, 1165.18] - - [1024, 3000, 1, 2816] - - [694, 8714.27] + - [830, 8714.27] - - [1024, 1856, 1, 256] - - [684, 7014.79] + - [820, 7014.79] - - [64, 1, 1, 1216] - - [657, 11.8205] + - [793, 11.8205] - - [4288, 64, 1, 128] - - [588, 1669.65] + - [724, 1669.65] - - [256, 448, 1, 3328] - - [612, 5152.39] + - [748, 5152.39] - - [1408, 6784, 1, 1280] - - [697, 8735.22] + - [833, 8735.22] - - [3584, 3584, 1, 1280] - - [694, 9020.09] + - [830, 9020.09] - - [7680, 24000, 1, 2560] - - [697, 6940.24] + - [833, 6940.24] - - [64, 2368, 1, 1280] - - [607, 4433.07] + - [743, 4433.07] - - [448, 2368, 1, 1280] - - [682, 5352.92] + - [818, 5352.92] - - [4608, 48000, 1, 1536] - - [676, 8129.11] + - [812, 8129.11] - - [5888, 5888, 1, 128] - - [669, 4700.91] + - [805, 4700.91] - - [64, 6784, 1, 3328] - - [679, 6170.82] + - [815, 6170.82] - - [2944, 256, 1, 1280] - - [709, 6177.65] + - [845, 6177.65] - - [2048, 16, 1, 2048] - - [631, 2167.7] + - [767, 2167.7] - - [256, 2368, 1, 128] - - [660, 2037.77] + - [796, 2037.77] - - [5056, 2368, 1, 3328] - - [677, 9040.6] + - [813, 9040.6] - - [2944, 4288, 1, 256] - - [708, 7552.22] + - [844, 7552.22] - - [1408, 3584, 1, 1280] - - [684, 8808.76] + - [820, 8808.76] - - [2368, 64, 1, 256] - - [620, 2320.51] + - [756, 2320.51] - - [1024, 128, 1, 128] - - [580, 1075.56] + - [716, 1075.56] - - [704, 128, 1, 3328] - - [613, 4985.02] + - [749, 4985.02] - - [5888, 4, 1, 128] - - [714, 33.6558] + - [850, 33.6558] - - [1856, 704, 1, 256] - - [689, 7110.98] + - [825, 7110.98] - - [1024, 1500, 1, 2816] - - [684, 8499.88] + - [820, 8499.88] - - [8448, 1, 1, 2816] - - [567, 251.469] + - [703, 251.469] - - [1024, 4, 1, 3328] - - [723, 541.032] + - [859, 541.032] - - [1024, 6000, 1, 2048] - - [684, 8698.59] + - [820, 8698.59] - - [512, 24000, 1, 2560] - - [677, 8963.7] + - [813, 8963.7] - - [6144, 3000, 1, 2560] - - [700, 8761.97] + - [836, 8761.97] - - [2368, 6784, 1, 3328] - - [694, 8867.49] + - [830, 8867.49] - - [1856, 1408, 1, 1280] - - [681, 7908.53] + - [817, 7908.53] - - [1856, 448, 1, 1280] - - [694, 6544.01] + - [830, 6544.01] - - [6784, 704, 1, 128] - - [660, 4086.45] + - [796, 4086.45] - - [4, 4, 1, 256] - - [627, 0.852941] + - [763, 0.852941] - - [128, 5888, 1, 128] - - [584, 2582.25] + - [720, 2582.25] - - [5056, 2944, 1, 128] - - [664, 4579.17] + - [800, 4579.17] - - [1408, 5888, 1, 256] - - [676, 8810.77] + - [812, 8810.77] - - [704, 2944, 1, 1280] - - [691, 8420.9] + - [827, 8420.9] - - [4288, 64, 1, 1280] - - [616, 4906.15] + - [752, 4906.15] - - [256, 64, 1, 256] - - [618, 689.953] + - [754, 689.953] - - [1024, 1024, 1, 256] - - [694, 5528.01] + - [830, 5528.01] - - [704, 1856, 1, 256] - - [678, 4452.92] + - [814, 4452.92] - - [2560, 64, 1, 2560] - - [607, 4563.09] + - [743, 4563.09] - - [3584, 704, 1, 1280] - - [684, 7898.77] + - [820, 7898.77] - - [256, 128, 1, 1280] - - [636, 2865.06] + - [772, 2865.06] - - [5888, 2368, 1, 256] - - [683, 8628.37] + - [819, 8628.37] - - [256, 2368, 1, 1280] - - [679, 6073.57] + - [815, 6073.57] - - [2944, 6784, 1, 128] - - [660, 4756.77] + - [796, 4756.77] - - [3584, 448, 1, 3328] - - [679, 7265.07] + - [815, 7265.07] - - [1408, 4, 1, 256] - - [720, 234.157] + - [856, 234.157] - - [704, 2368, 1, 3328] - - [677, 7248.98] + - [813, 7248.98] - - [2944, 448, 1, 256] - - [684, 6365.89] + - [820, 6365.89] - - [1856, 448, 1, 128] - - [662, 2976.34] + - [798, 2976.34] - - [4608, 6000, 1, 1536] - - [697, 9469.42] + - [833, 9469.42] - - [2368, 128, 1, 1280] - - [646, 4773.39] + - [782, 4773.39] - - [256, 5888, 1, 128] - - [661, 3112.0] + - [797, 3112.0] - - [64, 6784, 1, 256] - - [679, 3755.14] + - [815, 3755.14] - - [64, 5056, 1, 1280] - - [640, 4935.6] + - [776, 4935.6] - - [4, 6784, 1, 128] - - [715, 111.142] + - [851, 111.142] - - [3025, 64, 64, 64] - - [729, 6643.75] + - [865, 6643.75] - - [2944, 2944, 1, 1280] - - [677, 8869.55] + - [813, 8869.55] - - [5056, 448, 1, 3328] - - [710, 6706.2] + - [846, 6706.2] - - [4, 3584, 1, 1280] - - [627, 573.54] + - [763, 573.54] - - [1408, 128, 1, 128] - - [579, 1293.19] + - [715, 1293.19] - - [6784, 704, 1, 3328] - - [694, 8368.33] + - [830, 8368.33] - - [128, 64, 1, 1280] - - [653, 1260.41] + - [789, 1260.41] - - [2368, 256, 1, 1280] - - [679, 6154.47] + - [815, 6154.47] - - [4, 448, 1, 3328] - - [632, 351.738] + - [768, 351.738] - - [5888, 4288, 1, 128] - - [661, 4340.99] + - [797, 4340.99] - - [4, 5888, 1, 256] - - [609, 428.318] + - [745, 428.318] - - [1408, 2944, 1, 3328] - - [676, 9400.85] + - [812, 9400.85] - - [3584, 704, 1, 128] - - [663, 3392.55] + - [799, 3392.55] - - [64, 1024, 1, 256] - - [610, 1762.41] + - [746, 1762.41] - - [2368, 448, 1, 1280] - - [703, 5972.58] + - [839, 5972.58] - - [128, 3584, 1, 256] - - [679, 5224.32] + - [815, 5224.32] - - [704, 448, 1, 1280] - - [679, 4566.86] + - [815, 4566.86] - - [448, 5056, 1, 128] - - [661, 3876.19] + - [797, 3876.19] - - [6144, 4, 1, 2560] - - [603, 948.751] + - [739, 948.751] - - [5056, 3584, 1, 256] - - [693, 8162.56] + - [829, 8162.56] - - [4288, 4288, 1, 256] - - [700, 7653.34] + - [836, 7653.34] - - [1408, 5056, 1, 128] - - [667, 4554.34] + - [803, 4554.34] - - [2944, 3584, 1, 128] - - [673, 4147.0] + - [809, 4147.0] - - [3584, 2368, 1, 256] - - [694, 8195.05] + - [830, 8195.05] - - [5888, 5056, 1, 1280] - - [693, 9413.43] + - [829, 9413.43] - - [128, 1024, 1, 1280] - - [646, 4433.83] + - [782, 4433.83] - - [8448, 24000, 1, 2816] - - [687, 5227.12] + - [823, 5227.12] - - [64, 704, 1, 256] - - [610, 1441.89] + - [746, 1441.89] - - [4288, 256, 1, 1280] - - [709, 5687.8] + - [845, 5687.8] - - [3584, 3584, 1, 3328] - - [684, 9183.63] + - [820, 9183.63] - - [704, 64, 1, 128] - - [588, 402.835] + - [724, 402.835] - - [3072, 1500, 1, 128] - - [683, 7395.08] + - [819, 7395.08] - - [2048, 3136, 1, 512] - - [725, 8447.3] + - [861, 8447.3] - - [3025, 256, 64, 64] - - [733, 8063.79] + - [869, 8063.79] - - [5888, 6784, 1, 256] - - [677, 9282.01] + - [813, 9282.01] - - [4288, 2944, 1, 3328] - - [677, 9153.87] + - [813, 9153.87] - - [2944, 64, 1, 128] - - [594, 1463.53] + - [730, 1463.53] - - [1024, 128, 1, 3328] - - [644, 5377.41] + - [780, 5377.41] - - [1024, 16, 1, 500000] - - [560, 3997.13] + - [696, 3997.13] - - [4288, 128, 1, 3328] - - [648, 6053.31] + - [784, 6053.31] - - [7680, 128, 1, 2560] - - [694, 7769.24] + - [830, 7769.24] - - [256, 5056, 1, 1280] - - [703, 7200.84] + - [839, 7200.84] - - [1408, 256, 1, 128] - - [671, 1671.74] + - [807, 1671.74] - - [2944, 5888, 1, 3328] - - [683, 8642.18] + - [819, 8642.18] - - [6784, 5888, 1, 1280] - - [697, 8871.15] + - [833, 8871.15] - - [3072, 1, 1, 1024] - - [643, 205.972] + - [779, 205.972] - - [704, 128, 1, 256] - - [606, 1935.39] + - [742, 1935.39] - - [5888, 4288, 1, 1280] - - [684, 9176.7] + - [820, 9176.7] - - [1024, 24000, 1, 2048] - - [683, 8667.79] + - [819, 8667.79] - - [448, 256, 1, 1280] - - [616, 4327.95] + - [752, 4327.95] - - [5888, 3584, 1, 128] - - [661, 4669.45] + - [797, 4669.45] - - [64, 4288, 1, 3328] - - [641, 5375.04] + - [777, 5375.04] - - [448, 4, 1, 1280] - - [632, 289.716] + - [768, 289.716] - - [6784, 6784, 1, 3328] - - [690, 8306.73] + - [826, 8306.73] - - [5056, 4, 1, 1280] - - [602, 607.199] + - [738, 607.199] - - [4, 5888, 1, 3328] - - [627, 651.538] + - [763, 651.538] - - [256, 1408, 1, 1280] - - [679, 5177.09] + - [815, 5177.09] - - [3072, 16, 1, 1024] - - [638, 2207.63] + - [774, 2207.63] - - [704, 3584, 1, 128] - - [671, 3653.51] + - [807, 3653.51] - - [1024, 2, 1, 512] - - [658, 156.138] + - [794, 156.138] - - [5888, 448, 1, 3328] - - [679, 7896.85] + - [815, 7896.85] - - [2368, 4288, 1, 1280] - - [676, 8517.63] + - [812, 8517.63] - - [4288, 2944, 1, 128] - - [665, 4439.26] + - [801, 4439.26] - - [256, 64, 1, 3328] - - [651, 2704.76] + - [787, 2704.76] - - [2944, 64, 1, 3328] - - [616, 5647.15] + - [752, 5647.15] - - [6784, 64, 1, 3328] - - [689, 6434.61] + - [825, 6434.61] - - [5056, 2944, 1, 3328] - - [700, 8497.2] + - [836, 8497.2] - - [448, 128, 1, 256] - - [618, 1516.64] + - [754, 1516.64] - - [2944, 3584, 1, 256] - - [694, 8365.83] + - [830, 8365.83] - - [1408, 1408, 1, 3328] - - [677, 8440.42] + - [813, 8440.42] - - [1856, 128, 1, 1280] - - [679, 5242.93] + - [815, 5242.93] - - [3584, 3584, 1, 128] - - [661, 4385.94] + - [797, 4385.94] - - [64, 3584, 1, 256] - - [679, 3276.9] + - [815, 3276.9] - - [1408, 4, 1, 3328] - - [602, 605.504] + - [738, 605.504] - - [128, 2944, 1, 3328] - - [647, 6295.75] + - [783, 6295.75] - - [3584, 704, 1, 256] - - [684, 7711.64] + - [820, 7711.64] - - [2944, 448, 1, 3328] - - [695, 6503.97] + - [831, 6503.97] - - [1024, 2, 1, 500000] - - [564, 521.803] + - [700, 521.803] - - [3584, 1408, 1, 3328] - - [686, 8296.2] + - [822, 8296.2] - - [704, 3584, 1, 1280] - - [691, 7670.65] + - [827, 7670.65] - - [1024, 1408, 1, 128] - - [666, 2830.61] + - [802, 2830.61] - - [1856, 6784, 1, 256] - - [697, 8149.67] + - [833, 8149.67] - - [4288, 448, 1, 3328] - - [678, 7406.44] + - [814, 7406.44] - - [6784, 4288, 1, 128] - - [673, 4418.09] + - [809, 4418.09] - - [6784, 704, 1, 1280] - - [694, 8302.45] + - [830, 8302.45] - - [6144, 1, 1, 2560] - - [603, 243.427] + - [739, 243.427] - - [3584, 6784, 1, 256] - - [676, 9036.59] + - [812, 9036.59] - - [6144, 16, 1, 2560] - - [610, 3266.69] + - [746, 3266.69] - - [3584, 64, 1, 128] - - [594, 1555.19] + - [730, 1555.19] - - [5888, 1024, 1, 3328] - - [684, 8888.08] + - [820, 8888.08] - - [448, 64, 1, 128] - - [580, 248.074] + - [716, 248.074] - - [704, 6784, 1, 1280] - - [680, 7892.56] + - [816, 7892.56] - - [4, 448, 1, 256] - - [602, 70.8951] + - [738, 70.8951] - - [196, 1024, 64, 256] - - [728, 6630.86] + - [864, 6630.86] - - [5888, 128, 1, 256] - - [678, 5715.09] + - [814, 5715.09] - - [4096, 16, 1, 4096] - - [624, 3251.5] + - [760, 3251.5] - - [1856, 5056, 1, 3328] - - [693, 8740.27] + - [829, 8740.27] - - [4, 6784, 1, 256] - - [716, 360.412] + - [852, 360.412] - - [1024, 3584, 1, 128] - - [661, 3456.27] + - [797, 3456.27] - - [64, 704, 1, 3328] - - [629, 3817.47] + - [765, 3817.47] - - [2368, 2944, 1, 128] - - [667, 4605.47] + - [803, 4605.47] - - [5056, 64, 1, 256] - - [679, 3863.79] + - [815, 3863.79] - - [512, 1500, 1, 1536] - - [679, 6801.56] + - [815, 6801.56] - - [512, 1, 1, 500000] - - [568, 261.068] + - [704, 261.068] - - [5888, 2944, 1, 3328] - - [683, 8501.88] + - [819, 8501.88] - - [128, 3584, 1, 1280] - - [684, 5938.64] + - [820, 5938.64] - - [1024, 704, 1, 128] - - [670, 2172.29] + - [806, 2172.29] - - [1408, 2368, 1, 128] - - [666, 4023.2] + - [802, 4023.2] - - [5888, 2368, 1, 128] - - [667, 4424.62] + - [803, 4424.62] - - [128, 5056, 1, 3328] - - [679, 6692.16] + - [815, 6692.16] - - [3584, 6784, 1, 1280] - - [677, 9488.64] + - [813, 9488.64] - - [4288, 1856, 1, 256] - - [687, 8287.52] + - [823, 8287.52] - - [1856, 5888, 1, 256] - - [698, 7707.83] + - [834, 7707.83] - - [256, 256, 1, 256] - - [645, 1613.29] + - [781, 1613.29] - - [4288, 4288, 1, 3328] - - [687, 8923.59] + - [823, 8923.59] - - [1024, 1024, 1, 128] - - [667, 2553.71] + - [803, 2553.71] - - [4288, 1408, 1, 1280] - - [687, 8930.47] + - [823, 8930.47] - - [3584, 5056, 1, 128] - - [671, 4495.15] + - [807, 4495.15] - - [4, 1024, 1, 3328] - - [627, 415.694] + - [763, 415.694] - - [4, 704, 1, 128] - - [715, 13.9634] + - [851, 13.9634] - - [4288, 2368, 1, 256] - - [712, 7135.08] + - [848, 7135.08] - - [2944, 5056, 1, 1280] - - [684, 9118.61] + - [820, 9118.61] - - [448, 6784, 1, 256] - - [708, 5430.31] + - [844, 5430.31] - - [64, 128, 1, 128] - - [591, 83.057] + - [727, 83.057] - - [1856, 2368, 1, 128] - - [667, 4422.75] + - [803, 4422.75] - - [6784, 2368, 1, 3328] - - [680, 8769.4] + - [816, 8769.4] - - [1408, 6784, 1, 128] - - [667, 4739.0] + - [803, 4739.0] - - [256, 1024, 1, 1280] - - [689, 5722.21] + - [825, 5722.21] - - [704, 4, 1, 128] - - [715, 8.66578] + - [851, 8.66578] - - [1408, 4, 1, 128] - - [715, 26.1439] + - [851, 26.1439] - - [4288, 128, 1, 256] - - [689, 4865.38] + - [825, 4865.38] - - [4288, 1856, 1, 3328] - - [676, 9250.04] + - [812, 9250.04] - - [3584, 448, 1, 128] - - [667, 3029.59] + - [803, 3029.59] - - [64, 4288, 1, 128] - - [584, 1535.38] + - [720, 1535.38] - - [64, 448, 1, 3328] - - [654, 3457.36] + - [790, 3457.36] - - [448, 4, 1, 3328] - - [632, 367.328] + - [768, 367.328] - - [256, 4, 1, 3328] - - [723, 320.389] + - [859, 320.389] - - [4, 1408, 1, 1280] - - [720, 344.039] + - [856, 344.039] - - [3584, 64, 1, 1280] - - [608, 5191.07] + - [744, 5191.07] - - [1408, 448, 1, 128] - - [668, 2218.24] + - [804, 2218.24] - - [3584, 1024, 1, 1280] - - [690, 8253.11] + - [826, 8253.11] - - [1856, 5056, 1, 256] - - [708, 7552.55] + - [844, 7552.55] - - [4, 3584, 1, 256] - - [627, 325.456] + - [763, 325.456] - - [6784, 4288, 1, 3328] - - [683, 8655.34] + - [819, 8655.34] - - [4, 2944, 1, 1280] - - [627, 547.821] + - [763, 547.821] - - [1024, 4288, 1, 256] - - [684, 7788.83] + - [820, 7788.83] - - [5888, 3584, 1, 3328] - - [687, 9173.39] + - [823, 9173.39] - - [1856, 4, 1, 256] - - [718, 282.919] + - [854, 282.919] - - [4, 256, 1, 256] - - [627, 49.7485] + - [763, 49.7485] - - [5056, 3584, 1, 3328] - - [693, 8457.53] + - [829, 8457.53] - - [1408, 128, 1, 3328] - - [647, 5714.52] + - [783, 5714.52] - - [4, 64, 1, 1280] - - [723, 42.7667] + - [859, 42.7667] - - [2368, 1408, 1, 1280] - - [684, 8224.92] + - [820, 8224.92] - - [5056, 2944, 1, 1280] - - [676, 9295.13] + - [812, 9295.13] - - [8448, 6000, 1, 2816] - - [680, 8037.97] + - [816, 8037.97] - - [4, 4, 1, 128] - - [715, 0.1433898] + - [851, 0.1433898] - - [3584, 256, 1, 256] - - [679, 6116.79] + - [815, 6116.79] - - [3584, 2944, 1, 1280] - - [676, 8796.49] + - [812, 8796.49] - - [1024, 6784, 1, 256] - - [683, 8187.86] + - [819, 8187.86] - - [4, 128, 1, 256] - - [627, 30.4407] + - [763, 30.4407] - - [6784, 448, 1, 256] - - [679, 7862.3] + - [815, 7862.3] - - [5124, 9124, 1, 2048] - - [681, 8176.41] + - [817, 8176.41] - - [2944, 5056, 1, 3328] - - [676, 9328.34] + - [812, 9328.34] - - [6784, 4, 1, 128] - - [714, 204.9] + - [850, 204.9] - - [2944, 1408, 1, 128] - - [665, 3838.2] + - [801, 3838.2] - - [448, 128, 1, 3328] - - [630, 4632.16] + - [766, 4632.16] - - [64, 2944, 1, 3328] - - [647, 5663.47] + - [783, 5663.47] - - [5056, 6784, 1, 3328] - - [683, 8420.17] + - [819, 8420.17] - - [704, 2368, 1, 128] - - [667, 3321.79] + - [803, 3321.79] - - [3072, 1500, 1, 1024] - - [684, 8221.77] + - [820, 8221.77] - - [128, 2944, 1, 256] - - [679, 4550.52] + - [815, 4550.52] - - [128, 6784, 1, 128] - - [584, 2767.76] + - [720, 2767.76] - - [3584, 4288, 1, 256] - - [683, 8808.64] + - [819, 8808.64] - - [448, 1856, 1, 256] - - [688, 5166.63] + - [824, 5166.63] - - [1856, 6784, 1, 3328] - - [680, 8339.76] + - [816, 8339.76] - - [3584, 128, 1, 3328] - - [689, 6791.57] + - [825, 6791.57] - - [64, 1856, 1, 256] - - [611, 2210.03] + - [747, 2210.03] - - [64, 448, 1, 256] - - [643, 1008.35] + - [779, 1008.35] - - [5888, 4288, 1, 256] - - [683, 8869.63] + - [819, 8869.63] - - [128, 1500, 1, 1280] - - [640, 4733.54] + - [776, 4733.54] - - [5056, 1408, 1, 256] - - [681, 7523.31] + - [817, 7523.31] - - [35, 8457, 1, 4096] - - [574, 4023.17] + - [710, 4023.17] - - [64, 256, 1, 1280] - - [635, 1941.91] + - [771, 1941.91] - - [2944, 4, 1, 128] - - [714, 95.7426] + - [850, 95.7426] - - [3584, 1024, 1, 256] - - [706, 6553.68] + - [842, 6553.68] - - [512, 6000, 1, 1536] - - [680, 7357.25] + - [816, 7357.25] - - [256, 704, 1, 256] - - [679, 2912.81] + - [815, 2912.81] - - [5888, 5888, 1, 256] - - [690, 8802.7] + - [826, 8802.7] - - [4288, 1024, 1, 1280] - - [683, 8248.83] + - [819, 8248.83] - - [5888, 128, 1, 3328] - - [633, 6848.59] + - [769, 6848.59] - - [448, 6784, 1, 3328] - - [679, 8343.78] + - [815, 8343.78] - - [2944, 1408, 1, 1280] - - [676, 9229.48] + - [812, 9229.48] - - [3072, 6000, 1, 1024] - - [697, 9015.01] + - [833, 9015.01] - - [1024, 32, 1, 512] - - [618, 1498.07] + - [754, 1498.07] - - [2944, 1856, 1, 3328] - - [693, 7176.48] + - [829, 7176.48] - - [2368, 64, 1, 128] - - [584, 1206.48] + - [720, 1206.48] - - [256, 1024, 1, 128] - - [661, 1178.28] + - [797, 1178.28] - - [3584, 5888, 1, 1280] - - [683, 9023.58] + - [819, 9023.58] - - [64, 4, 1, 128] - - [715, 1.089372] + - [851, 1.089372] - - [6784, 1856, 1, 1280] - - [677, 8964.51] + - [813, 8964.51] - - [2944, 5056, 1, 256] - - [683, 8860.12] + - [819, 8860.12] - - [5888, 256, 1, 3328] - - [694, 8308.66] + - [830, 8308.66] - - [2944, 4288, 1, 128] - - [662, 4507.61] + - [798, 4507.61] - - [3584, 1408, 1, 256] - - [677, 8234.71] + - [813, 8234.71] - - [704, 3584, 1, 3328] - - [689, 7377.26] + - [825, 7377.26] - - [5056, 448, 1, 1280] - - [678, 7145.47] + - [814, 7145.47] - - [3584, 1856, 1, 3328] - - [694, 8954.81] + - [830, 8954.81] - - [64, 1408, 1, 128] - - [591, 731.974] + - [727, 731.974] - - [4288, 6784, 1, 1280] - - [683, 9166.55] + - [819, 9166.55] - - [1024, 3000, 1, 2048] - - [694, 7723.83] + - [830, 7723.83] - - [1408, 704, 1, 1280] - - [684, 7863.1] + - [820, 7863.1] - - [2944, 1024, 1, 256] - - [677, 5035.02] + - [813, 5035.02] - - [256, 64, 1, 128] - - [583, 150.757] + - [719, 150.757] - - [2368, 4288, 1, 3328] - - [681, 8568.84] + - [817, 8568.84] - - [4, 1408, 1, 256] - - [627, 219.885] + - [763, 219.885] - - [1024, 1408, 1, 1280] - - [709, 6761.13] + - [845, 6761.13] - - [64, 64, 1, 256] - - [609, 198.694] + - [745, 198.694] - - [704, 256, 1, 3328] - - [679, 4291.62] + - [815, 4291.62] - - [6784, 5056, 1, 256] - - [678, 8545.02] + - [814, 8545.02] - - [1856, 1856, 1, 128] - - [666, 4034.93] + - [802, 4034.93] - - [4288, 5888, 1, 256] - - [697, 8998.05] + - [833, 8998.05] - - [4, 704, 1, 3328] - - [632, 452.4] + - [768, 452.4] - - [35, 8457, 1, 2048] - - [575, 3375.37] + - [711, 3375.37] - - [448, 2944, 1, 256] - - [679, 6346.74] + - [815, 6346.74] - - [4, 4288, 1, 3328] - - [632, 630.978] + - [768, 630.978] - - [2944, 6784, 1, 256] - - [706, 8002.92] + - [842, 8002.92] - - [2944, 2944, 1, 128] - - [661, 4661.41] + - [797, 4661.41] - - [4, 4, 1, 1280] - - [632, 3.14762] + - [768, 3.14762] - - [1856, 3584, 1, 1280] - - [676, 8677.66] + - [812, 8677.66] - - [64, 2944, 1, 256] - - [679, 2926.95] + - [815, 2926.95] - - [3584, 1408, 1, 1280] - - [690, 8238.9] + - [826, 8238.9] - - [448, 256, 1, 128] - - [591, 1042.72] + - [727, 1042.72] - - [4288, 448, 1, 128] - - [667, 3698.82] + - [803, 3698.82] - - [5056, 256, 1, 1280] - - [684, 7058.5] + - [820, 7058.5] - - [1856, 1408, 1, 3328] - - [681, 8348.35] + - [817, 8348.35] - - [128, 128, 1, 128] - - [591, 145.736] + - [727, 145.736] - - [1024, 4288, 1, 3328] - - [677, 8042.61] + - [813, 8042.61] - - [448, 2368, 1, 256] - - [689, 5935.0] + - [825, 5935.0] - - [1024, 4, 1, 128] - - [715, 15.93] + - [851, 15.93] - - [64, 1408, 1, 1280] - - [613, 3865.49] + - [749, 3865.49] - - [64, 6784, 1, 1280] - - [709, 5629.61] + - [845, 5629.61] - - [5056, 448, 1, 256] - - [679, 7637.91] + - [815, 7637.91] - - [2944, 2368, 1, 3328] - - [687, 9112.44] + - [823, 9112.44] - - [704, 4288, 1, 3328] - - [679, 7950.2] + - [815, 7950.2] - - [1408, 128, 1, 256] - - [679, 2898.17] + - [815, 2898.17] - - [1024, 1856, 1, 1280] - - [677, 8087.51] + - [813, 8087.51] - - [6784, 1856, 1, 256] - - [708, 7538.25] + - [844, 7538.25] - - [512, 48000, 1, 2816] - - [676, 9704.21] + - [812, 9704.21] - - [512, 3000, 1, 2816] - - [678, 7621.63] + - [814, 7621.63] - - [128, 2368, 1, 3328] - - [641, 6038.94] + - [777, 6038.94] - - [1024, 5888, 1, 256] - - [693, 8185.82] + - [829, 8185.82] - - [64, 2944, 1, 1280] - - [640, 4540.24] + - [776, 4540.24] - - [6784, 1408, 1, 256] - - [693, 8574.0] + - [829, 8574.0] - - [5056, 64, 1, 3328] - - [641, 6310.97] + - [777, 6310.97] - - [128, 704, 1, 128] - - [580, 696.618] + - [716, 696.618] - - [1408, 2368, 1, 256] - - [679, 4995.06] + - [815, 4995.06] - - [1408, 1408, 1, 256] - - [676, 7552.34] + - [812, 7552.34] - - [4, 64, 1, 128] - - [714, 1.90441] + - [850, 1.90441] - - [64, 128, 1, 1280] - - [653, 1272.64] + - [789, 1272.64] - - [1024, 8, 1, 500000] - - [561, 2013.23] + - [697, 2013.23] - - [4, 2368, 1, 128] - - [715, 49.9526] + - [851, 49.9526] - - [2368, 2368, 1, 128] - - [666, 4483.8] + - [802, 4483.8] - - [64, 5888, 1, 128] - - [583, 1957.67] + - [719, 1957.67] - - [5888, 4, 1, 3328] - - [716, 638.798] + - [852, 638.798] - - [6784, 1408, 1, 128] - - [661, 4715.61] + - [797, 4715.61] - - [1408, 5056, 1, 256] - - [693, 8557.67] + - [829, 8557.67] - - [512, 50176, 1, 128] - - [724, 8809.39] + - [860, 8809.39] - - [5056, 128, 1, 3328] - - [616, 6810.66] + - [752, 6810.66] - - [128, 128, 1, 1280] - - [650, 1899.69] + - [786, 1899.69] - - [512, 2, 1, 512] - - [570, 87.4813] + - [706, 87.4813] - - [448, 704, 1, 256] - - [689, 3765.97] + - [825, 3765.97] - - [4288, 3584, 1, 128] - - [674, 4563.77] + - [810, 4563.77] - - [2944, 128, 1, 3328] - - [616, 6507.45] + - [752, 6507.45] - - [128, 5056, 1, 1280] - - [679, 6557.85] + - [815, 6557.85] - - [3584, 5056, 1, 1280] - - [676, 9407.93] + - [812, 9407.93] - - [256, 448, 1, 1280] - - [640, 4096.1] + - [776, 4096.1] - - [704, 704, 1, 128] - - [666, 2374.31] + - [802, 2374.31] - - [5056, 4, 1, 128] - - [714, 125.52] + - [850, 125.52] - - [704, 256, 1, 1280] - - [689, 4016.23] + - [825, 4016.23] - - [64, 2368, 1, 3328] - - [646, 5159.29] + - [782, 5159.29] - - [1856, 1024, 1, 128] - - [666, 3356.47] + - [802, 3356.47] - - [1856, 64, 1, 128] - - [583, 945.644] + - [719, 945.644] - - [4096, 64, 1, 4096] - - [649, 6260.24] + - [785, 6260.24] - - [1024, 24000, 1, 1536] - - [693, 9368.5] + - [829, 9368.5] - - [704, 4288, 1, 256] - - [690, 7329.39] + - [826, 7329.39] - - [5888, 2368, 1, 1280] - - [679, 8624.71] + - [815, 8624.71] - - [6784, 1856, 1, 3328] - - [683, 9012.45] + - [819, 9012.45] - - [64, 128, 1, 256] - - [609, 374.591] + - [745, 374.591] - - [2368, 5888, 1, 1280] - - [677, 9045.76] + - [813, 9045.76] - - [5888, 256, 1, 1280] - - [694, 7999.17] + - [830, 7999.17] - - [4, 5888, 1, 1280] - - [627, 615.839] + - [763, 615.839] - - [704, 128, 1, 128] - - [583, 693.269] + - [719, 693.269] - - [1024, 4, 1, 1280] - - [722, 372.464] + - [858, 372.464] - - [2368, 1856, 1, 3328] - - [694, 8246.91] + - [830, 8246.91] - - [2368, 128, 1, 128] - - [584, 1963.53] + - [720, 1963.53] - - [2944, 704, 1, 256] - - [694, 7116.24] + - [830, 7116.24] - - [5056, 128, 1, 128] - - [587, 2519.49] + - [723, 2519.49] - - [2368, 1024, 1, 3328] - - [679, 7959.13] + - [815, 7959.13] - - [35, 700, 1, 2048] - - [575, 1766.86] + - [711, 1766.86] - - [256, 704, 1, 3328] - - [679, 4296.56] + - [815, 4296.56] - - [704, 3584, 1, 256] - - [678, 7441.61] + - [814, 7441.61] - - [704, 2944, 1, 3328] - - [695, 7195.81] + - [831, 7195.81] - - [6784, 1024, 1, 128] - - [666, 4509.18] + - [802, 4509.18] - - [256, 448, 1, 128] - - [591, 838.003] + - [727, 838.003] - - [448, 1024, 1, 3328] - - [689, 6515.65] + - [825, 6515.65] - - [2944, 1024, 1, 3328] - - [684, 8751.63] + - [820, 8751.63] - - [2944, 5056, 1, 128] - - [661, 4799.73] + - [797, 4799.73] - - [2368, 256, 1, 256] - - [678, 4754.67] + - [814, 4754.67] - - [1408, 6784, 1, 256] - - [706, 7477.09] + - [842, 7477.09] - - [6784, 1408, 1, 3328] - - [684, 8968.57] + - [820, 8968.57] - - [4288, 6784, 1, 128] - - [659, 4455.74] + - [795, 4455.74] - - [1408, 2944, 1, 128] - - [671, 3862.79] + - [807, 3862.79] - - [704, 64, 1, 256] - - [610, 1441.89] + - [746, 1441.89] - - [3072, 4, 1, 1024] - - [628, 711.803] + - [764, 711.803] - - [256, 2368, 1, 3328] - - [703, 5199.73] + - [839, 5199.73] - - [6784, 2944, 1, 1280] - - [687, 8914.45] + - [823, 8914.45] - - [4288, 1856, 1, 128] - - [667, 4683.3] + - [803, 4683.3] - - [1856, 2944, 1, 128] - - [661, 4589.34] + - [797, 4589.34] - - [6784, 448, 1, 128] - - [661, 3918.53] + - [797, 3918.53] - - [64, 3584, 1, 128] - - [592, 1468.11] + - [728, 1468.11] - - [448, 5056, 1, 1280] - - [684, 7561.4] + - [820, 7561.4] - - [4288, 5056, 1, 1280] - - [676, 9304.11] + - [812, 9304.11] - - [2368, 1856, 1, 128] - - [666, 4322.17] + - [802, 4322.17] - - [128, 448, 1, 1280] - - [646, 3336.48] + - [782, 3336.48] - - [4288, 704, 1, 256] - - [689, 7834.65] + - [825, 7834.65] - - [256, 3584, 1, 128] - - [662, 2500.96] + - [798, 2500.96] - - [5888, 704, 1, 256] - - [708, 7244.49] + - [844, 7244.49] - - [3584, 1024, 1, 128] - - [673, 3169.03] + - [809, 3169.03] - - [256, 5888, 1, 3328] - - [694, 7763.47] + - [830, 7763.47] - - [1408, 4288, 1, 3328] - - [676, 9273.8] + - [812, 9273.8] - - [6784, 4288, 1, 256] - - [684, 8825.2] + - [820, 8825.2] - - [4288, 256, 1, 128] - - [663, 2621.54] + - [799, 2621.54] - - [448, 1856, 1, 3328] - - [704, 5859.8] + - [840, 5859.8] - - [5888, 256, 1, 256] - - [694, 7124.84] + - [830, 7124.84] - - [1024, 4, 1, 500000] - - [559, 1030.2] + - [695, 1030.2] - - [6784, 1024, 1, 1280] - - [676, 9083.11] + - [812, 9083.11] - - [5888, 1024, 1, 128] - - [663, 4297.16] + - [799, 4297.16] - - [1024, 128, 1, 256] - - [679, 2086.82] + - [815, 2086.82] - - [512, 16, 1, 500000] - - [560, 3921.96] + - [696, 3921.96] - - [128, 64, 1, 3328] - - [650, 1969.97] + - [786, 1969.97] - - [448, 64, 1, 256] - - [635, 1092.37] + - [771, 1092.37] - - [2368, 256, 1, 128] - - [666, 2174.84] + - [802, 2174.84] - - [6784, 3584, 1, 1280] - - [676, 9558.82] + - [812, 9558.82] - - [1024, 6784, 1, 1280] - - [685, 8637.72] + - [821, 8637.72] - - [2944, 64, 1, 1280] - - [607, 4770.13] + - [743, 4770.13] - - [1408, 2944, 1, 1280] - - [676, 9238.47] + - [812, 9238.47] - - [256, 1856, 1, 256] - - [702, 4498.43] + - [838, 4498.43] - - [1408, 2368, 1, 3328] - - [684, 8344.97] + - [820, 8344.97] - - [2944, 4, 1, 3328] - - [719, 661.209] + - [855, 661.209] - - [128, 1408, 1, 3328] - - [647, 5641.42] + - [783, 5641.42] - - [2944, 1856, 1, 128] - - [661, 4488.04] + - [797, 4488.04] - - [256, 2944, 1, 128] - - [671, 2233.18] + - [807, 2233.18] - - [256, 6784, 1, 128] - - [660, 3139.9] + - [796, 3139.9] - - [2368, 4, 1, 128] - - [715, 38.7612] + - [851, 38.7612] - - [1408, 256, 1, 3328] - - [711, 4927.67] + - [847, 4927.67] - - [1856, 4, 1, 128] - - [715, 42.3719] + - [851, 42.3719] - - [1024, 16, 1, 512] - - [627, 1115.61] + - [763, 1115.61] - - [5056, 6784, 1, 128] - - [662, 4963.45] + - [798, 4963.45] - - [4288, 5056, 1, 128] - - [660, 4928.09] + - [796, 4928.09] - - [1856, 5888, 1, 128] - - [667, 4865.15] + - [803, 4865.15] - - [7680, 2, 1, 2560] - - [603, 499.612] + - [739, 499.612] - - [3584, 1856, 1, 256] - - [693, 7978.38] + - [829, 7978.38] - - [4288, 3584, 1, 1280] - - [693, 7852.26] + - [829, 7852.26] - - [2368, 448, 1, 256] - - [708, 5238.93] + - [844, 5238.93] - - [4288, 256, 1, 3328] - - [679, 6751.34] + - [815, 6751.34] - - [1856, 704, 1, 128] - - [661, 3525.56] + - [797, 3525.56] - - [1408, 64, 1, 256] - - [620, 1884.8] + - [756, 1884.8] - - [64, 1856, 1, 128] - - [597, 888.205] + - [733, 888.205] - - [4, 256, 1, 128] - - [714, 7.38178] + - [850, 7.38178] - - [512, 16, 1, 512] - - [627, 663.756] + - [763, 663.756] - - [704, 5888, 1, 128] - - [661, 4424.55] + - [797, 4424.55] - - [6784, 3584, 1, 128] - - [663, 3823.4] + - [799, 3823.4] - - [1024, 64, 1, 256] - - [605, 1379.81] + - [741, 1379.81] - - [64, 2368, 1, 256] - - [679, 2424.93] + - [815, 2424.93] - - [5124, 1500, 1, 2048] - - [697, 8391.84] + - [833, 8391.84] - - [4288, 5056, 1, 3328] - - [683, 9274.14] + - [819, 9274.14] - - [4, 1856, 1, 1280] - - [627, 453.474] + - [763, 453.474] - - [4288, 128, 1, 128] - - [661, 2157.8] + - [797, 2157.8] - - [512, 2, 1, 500000] - - [571, 516.895] + - [707, 516.895] - - [1408, 1408, 1, 128] - - [662, 3600.49] + - [798, 3600.49] - - [7680, 16, 1, 2560] - - [642, 3542.59] + - [778, 3542.59] - - [1856, 128, 1, 128] - - [594, 1532.8] + - [730, 1532.8] - - [5056, 2368, 1, 256] - - [706, 7684.07] + - [842, 7684.07] - - [4288, 704, 1, 3328] - - [679, 7642.96] + - [815, 7642.96] - - [448, 3584, 1, 256] - - [689, 6734.07] + - [825, 6734.07] - - [2368, 64, 1, 1280] - - [640, 3962.24] + - [776, 3962.24] - - [2368, 1024, 1, 1280] - - [691, 7989.64] + - [827, 7989.64] - - [2944, 1408, 1, 3328] - - [694, 8954.66] + - [830, 8954.66] - - [6144, 1500, 1, 2560] - - [712, 8170.07] + - [848, 8170.07] - - [4224, 1, 1, 128] - - [643, 76.9] + - [779, 76.9] - - [1024, 1408, 1, 3328] - - [709, 6961.38] + - [845, 6961.38] - - [2944, 5888, 1, 1280] - - [690, 8797.53] + - [826, 8797.53] - - [8448, 2, 1, 2816] - - [565, 496.958] + - [701, 496.958] - - [1408, 4, 1, 1280] - - [720, 471.891] + - [856, 471.891] - - [5888, 3584, 1, 256] - - [697, 8246.3] + - [833, 8246.3] - - [2368, 5056, 1, 128] - - [660, 4906.9] + - [796, 4906.9] - - [1408, 1856, 1, 3328] - - [684, 9006.8] + - [820, 9006.8] - - [4, 4, 1, 3328] - - [632, 5.83793] + - [768, 5.83793] - - [5888, 5056, 1, 3328] - - [697, 8545.1] + - [833, 8545.1] - - [7680, 6000, 1, 2560] - - [690, 7996.0] + - [826, 7996.0] - - [6784, 1408, 1, 1280] - - [684, 8888.13] + - [820, 8888.13] - - [4, 1024, 1, 1280] - - [632, 302.109] + - [768, 302.109] - - [512, 3000, 1, 2560] - - [684, 7809.43] + - [820, 7809.43] - - [704, 2944, 1, 256] - - [689, 4909.24] + - [825, 4909.24] - - [4288, 64, 1, 256] - - [689, 3264.72] + - [825, 3264.72] - - [6784, 5888, 1, 3328] - - [697, 9544.52] + - [833, 9544.52] - - [2368, 4288, 1, 128] - - [660, 4873.03] + - [796, 4873.03] - - [64, 4288, 1, 1280] - - [646, 4656.42] + - [782, 4656.42] - - [6784, 64, 1, 1280] - - [679, 6230.43] + - [815, 6230.43] - - [3584, 128, 1, 128] - - [587, 2315.57] + - [723, 2315.57] - - [1024, 6784, 1, 128] - - [661, 3758.94] + - [797, 3758.94] - - [1024, 1500, 1, 1536] - - [710, 6972.0] + - [846, 6972.0] - - [1408, 64, 1, 3328] - - [613, 5079.58] + - [749, 5079.58] - - [6784, 4, 1, 256] - - [599, 487.938] + - [735, 487.938] - - [1408, 1408, 1, 1280] - - [712, 7423.31] + - [848, 7423.31] - - [256, 2368, 1, 256] - - [679, 4986.9] + - [815, 4986.9] - - [3072, 3000, 1, 1024] - - [681, 7844.01] + - [817, 7844.01] - - [448, 4288, 1, 3328] - - [680, 7204.79] + - [816, 7204.79] - - [2368, 1408, 1, 256] - - [712, 5897.96] + - [848, 5897.96] - - [704, 2368, 1, 256] - - [679, 7000.93] + - [815, 7000.93] - - [1024, 24000, 1, 2560] - - [706, 8562.31] + - [842, 8562.31] - - [2944, 448, 1, 1280] - - [694, 7155.93] + - [830, 7155.93] - - [5888, 2368, 1, 3328] - - [693, 9252.42] + - [829, 9252.42] - - [1024, 256, 1, 128] - - [675, 1255.88] + - [811, 1255.88] - - [5124, 9124, 1, 1760] - - [687, 9168.49] + - [823, 9168.49] - - [448, 1408, 1, 1280] - - [679, 6150.34] + - [815, 6150.34] - - [448, 1856, 1, 1280] - - [694, 6489.76] + - [830, 6489.76] - - [4288, 448, 1, 1280] - - [709, 6887.02] + - [845, 6887.02] - - [5888, 704, 1, 3328] - - [689, 8230.64] + - [825, 8230.64] - - [4, 1856, 1, 128] - - [715, 27.0964] + - [851, 27.0964] - - [5056, 256, 1, 128] - - [660, 3469.01] + - [796, 3469.01] - - [1856, 256, 1, 128] - - [661, 2534.16] + - [797, 2534.16] - - [128, 2368, 1, 256] - - [679, 3660.22] + - [815, 3660.22] - - [704, 4, 1, 256] - - [627, 134.596] + - [763, 134.596] - - [1024, 6784, 1, 3328] - - [681, 8482.75] + - [817, 8482.75] - - [1408, 5888, 1, 128] - - [661, 4644.52] + - [797, 4644.52] - - [4288, 4, 1, 128] - - [714, 35.8799] + - [850, 35.8799] - - [512, 3136, 1, 2048] - - [726, 6386.69] + - [862, 6386.69] - - [1408, 1024, 1, 256] - - [679, 5440.82] + - [815, 5440.82] - - [128, 64, 1, 256] - - [609, 380.019] + - [745, 380.019] - - [8448, 1500, 1, 2816] - - [676, 9155.92] + - [812, 9155.92] - - [256, 704, 1, 128] - - [661, 895.623] + - [797, 895.623] - - [2560, 7000, 1, 2560] - - [688, 8565.66] + - [824, 8565.66] - - [5888, 64, 1, 1280] - - [703, 5007.83] + - [839, 5007.83] - - [128, 4, 1, 3328] - - [722, 165.21] + - [858, 165.21] - - [5056, 6784, 1, 1280] - - [687, 9331.48] + - [823, 9331.48] - - [1024, 448, 1, 1280] - - [689, 6501.46] + - [825, 6501.46] - - [704, 5056, 1, 3328] - - [676, 8090.13] + - [812, 8090.13] - - [128, 5056, 1, 256] - - [689, 5537.37] + - [825, 5537.37] - - [3584, 5056, 1, 3328] - - [685, 8633.24] + - [821, 8633.24] - - [1856, 4, 1, 3328] - - [723, 582.814] + - [859, 582.814] - - [4, 2944, 1, 128] - - [714, 114.292] + - [850, 114.292] - - [2368, 2944, 1, 3328] - - [693, 8749.55] + - [829, 8749.55] - - [448, 448, 1, 1280] - - [617, 4694.93] + - [753, 4694.93] - - [128, 4, 1, 128] - - [714, 4.94734] + - [850, 4.94734] - - [2368, 3584, 1, 256] - - [693, 8418.59] + - [829, 8418.59] - - [4608, 3000, 1, 1536] - - [683, 9076.47] + - [819, 9076.47] - - [1024, 256, 1, 1280] - - [689, 5562.84] + - [825, 5562.84] - - [5056, 3584, 1, 1280] - - [683, 8365.09] + - [819, 8365.09] - - [5124, 9124, 1, 4096] - - [693, 8648.58] + - [829, 8648.58] - - [7680, 48000, 1, 2560] - - [687, 4098.26] + - [823, 4098.26] - - [1856, 704, 1, 1280] - - [679, 8141.04] + - [815, 8141.04] - - [1856, 2944, 1, 1280] - - [681, 8214.4] + - [817, 8214.4] - - [4608, 1500, 1, 1536] - - [689, 8424.53] + - [825, 8424.53] - - [1024, 48000, 1, 2816] - - [680, 8513.18] + - [816, 8513.18] - - [5124, 9124, 1, 2560] - - [697, 8641.24] + - [833, 8641.24] - - [128, 1024, 1, 256] - - [611, 2356.45] + - [747, 2356.45] - - [2944, 1408, 1, 256] - - [693, 8254.29] + - [829, 8254.29] - - [4288, 1408, 1, 3328] - - [687, 9138.49] + - [823, 9138.49] - - [3584, 64, 1, 3328] - - [600, 5629.62] + - [736, 5629.62] - - [5888, 2944, 1, 128] - - [661, 4119.33] + - [797, 4119.33] - - [2944, 1024, 1, 128] - - [663, 4002.96] + - [799, 4002.96] - - [128, 1, 1, 1024] - - [657, 20.0805] + - [793, 20.0805] - - [5124, 700, 1, 2048] - - [694, 7653.84] + - [830, 7653.84] - - [4, 4288, 1, 1280] - - [627, 587.749] + - [763, 587.749] - - [6784, 5056, 1, 128] - - [666, 4855.85] + - [802, 4855.85] - - [256, 1024, 1, 3328] - - [689, 6116.28] + - [825, 6116.28] - - [3584, 4, 1, 256] - - [601, 395.576] + - [737, 395.576] - - [1856, 64, 1, 3328] - - [616, 5732.6] + - [752, 5732.6] - - [4, 128, 1, 3328] - - [722, 162.689] + - [858, 162.689] - - [256, 12544, 1, 1024] - - [726, 7628.92] + - [862, 7628.92] - - [5888, 1408, 1, 3328] - - [687, 9524.43] + - [823, 9524.43] - - [448, 2944, 1, 128] - - [661, 3163.91] + - [797, 3163.91] - - [2368, 1856, 1, 256] - - [689, 8167.36] + - [825, 8167.36] - - [256, 5056, 1, 256] - - [679, 7292.13] + - [815, 7292.13] - - [5056, 5056, 1, 128] - - [667, 5043.99] + - [803, 5043.99] - - [448, 3584, 1, 3328] - - [684, 6839.56] + - [820, 6839.56] - - [4, 5056, 1, 3328] - - [632, 639.886] + - [768, 639.886] - - [256, 256, 1, 128] - - [591, 554.902] + - [727, 554.902] - - [5888, 256, 1, 128] - - [663, 3562.47] + - [799, 3562.47] - - [4, 5056, 1, 128] - - [714, 149.907] + - [850, 149.907] - - [448, 256, 1, 256] - - [610, 2121.5] + - [746, 2121.5] - - [704, 4, 1, 3328] - - [720, 455.919] + - [856, 455.919] - - [1408, 256, 1, 256] - - [679, 4352.68] + - [815, 4352.68] - - [3584, 1856, 1, 128] - - [670, 3933.23] + - [806, 3933.23] - - [4288, 4288, 1, 128] - - [661, 4888.61] + - [797, 4888.61] - - [1856, 1024, 1, 3328] - - [697, 8242.64] + - [833, 8242.64] - - [1856, 4288, 1, 128] - - [666, 4647.4] + - [802, 4647.4] - - [1024, 6000, 1, 2560] - - [691, 8526.75] + - [827, 8526.75] - - [1024, 5056, 1, 256] - - [676, 7343.83] + - [812, 7343.83] - - [5056, 5888, 1, 128] - - [665, 4053.5] + - [801, 4053.5] - - [2368, 1408, 1, 3328] - - [679, 8466.2] + - [815, 8466.2] - - [1024, 48000, 1, 1536] - - [697, 9487.74] + - [833, 9487.74] - - [5888, 448, 1, 256] - - [710, 6081.54] + - [846, 6081.54] - - [5888, 6784, 1, 128] - - [662, 4820.27] + - [798, 4820.27] - - [2368, 4, 1, 3328] - - [721, 620.628] + - [857, 620.628] - - [6784, 5056, 1, 1280] - - [706, 8525.5] + - [842, 8525.5] - - [5056, 704, 1, 1280] - - [676, 7933.06] + - [812, 7933.06] - - [1024, 48000, 1, 2560] - - [697, 8877.94] + - [833, 8877.94] - - [4608, 32, 1, 1536] - - [626, 3556.83] + - [762, 3556.83] - - [1024, 2368, 1, 128] - - [669, 2943.75] + - [805, 2943.75] - - [128, 704, 1, 256] - - [610, 2059.8] + - [746, 2059.8] - - [2368, 448, 1, 3328] - - [689, 5290.42] + - [825, 5290.42] - - [128, 5888, 1, 3328] - - [689, 7764.43] + - [825, 7764.43] - - [448, 128, 1, 1280] - - [640, 3373.28] + - [776, 3373.28] - - [6784, 4, 1, 3328] - - [599, 676.063] + - [735, 676.063] - - [4288, 4, 1, 1280] - - [632, 564.775] + - [768, 564.775] - - [1024, 64, 1, 3328] - - [646, 4293.48] + - [782, 4293.48] - - [3072, 48000, 1, 1024] - - [696, 7826.51] + - [832, 7826.51] - - [256, 4, 1, 128] - - [715, 4.93304] + - [851, 4.93304] - - [1024, 5888, 1, 128] - - [674, 3610.46] + - [810, 3610.46] - - [3584, 5888, 1, 128] - - [662, 4722.35] + - [798, 4722.35] - - [5056, 5888, 1, 256] - - [697, 9159.11] + - [833, 9159.11] - - [2368, 1024, 1, 256] - - [689, 7482.71] + - [825, 7482.71] - - [2944, 1856, 1, 256] - - [693, 8209.0] + - [829, 8209.0] - - [1856, 6784, 1, 1280] - - [689, 8205.43] + - [825, 8205.43] - - [64, 5056, 1, 128] - - [584, 2079.35] + - [720, 2079.35] - - [64, 6784, 1, 128] - - [584, 2437.58] + - [720, 2437.58] - - [448, 704, 1, 128] - - [660, 1506.45] + - [796, 1506.45] - - [4, 1024, 1, 128] - - [715, 17.3463] + - [851, 17.3463] - - [1408, 448, 1, 256] - - [679, 5545.45] + - [815, 5545.45] - - [1408, 704, 1, 128] - - [665, 2931.65] + - [801, 2931.65] - - [64, 256, 1, 3328] - - [651, 2816.52] + - [787, 2816.52] - - [8448, 3000, 1, 2816] - - [685, 8872.99] + - [821, 8872.99] - - [6784, 448, 1, 3328] - - [679, 7555.48] + - [815, 7555.48] - - [5056, 1856, 1, 1280] - - [677, 8652.36] + - [813, 8652.36] - - [1408, 1024, 1, 3328] - - [681, 7781.42] + - [817, 7781.42] - - [2368, 256, 1, 3328] - - [685, 5392.06] + - [821, 5392.06] - - [7680, 1500, 1, 2560] - - [683, 8919.72] + - [819, 8919.72] - - [5888, 3584, 1, 1280] - - [683, 9235.85] + - [819, 9235.85] - - [1856, 3584, 1, 3328] - - [694, 8348.83] + - [830, 8348.83] - - [5888, 128, 1, 1280] - - [679, 5928.61] + - [815, 5928.61] - - [1024, 2944, 1, 256] - - [710, 6630.27] + - [846, 6630.27] - - [448, 6784, 1, 1280] - - [691, 8332.45] + - [827, 8332.45] - - [256, 3584, 1, 1280] - - [681, 7140.19] + - [817, 7140.19] - - [448, 128, 1, 128] - - [583, 552.813] + - [719, 552.813] - - [704, 5056, 1, 256] - - [689, 7959.68] + - [825, 7959.68] - - [3584, 1024, 1, 3328] - - [681, 8386.84] + - [817, 8386.84] - - [2944, 1856, 1, 1280] - - [697, 7670.29] + - [833, 7670.29] - - [128, 256, 1, 128] - - [598, 258.37] + - [734, 258.37] - - [5056, 256, 1, 256] - - [689, 5736.77] + - [825, 5736.77] - - [2944, 4288, 1, 3328] - - [676, 8730.8] + - [812, 8730.8] - - [2368, 3584, 1, 3328] - - [678, 8437.71] + - [814, 8437.71] - - [2944, 704, 1, 1280] - - [689, 8342.53] + - [825, 8342.53] - - [128, 4, 1, 256] - - [609, 24.9242] + - [745, 24.9242] - - [2944, 3584, 1, 1280] - - [691, 8322.11] + - [827, 8322.11] - - [1856, 5888, 1, 1280] - - [676, 8911.91] + - [812, 8911.91] - - [256, 256, 1, 1280] - - [640, 3653.67] + - [776, 3653.67] - - [4608, 24000, 1, 1536] - - [690, 8931.06] + - [826, 8931.06] - - [4288, 1408, 1, 256] - - [677, 8338.45] + - [813, 8338.45] - - [3584, 64, 1, 256] - - [689, 3414.07] + - [825, 3414.07] - - [64, 1856, 1, 3328] - - [616, 5460.23] + - [752, 5460.23] - - [256, 1408, 1, 128] - - [660, 1424.09] + - [796, 1424.09] - - [5888, 1408, 1, 128] - - [671, 4177.88] + - [807, 4177.88] - - [4288, 2368, 1, 1280] - - [680, 8596.05] + - [816, 8596.05] - - [4, 4288, 1, 256] - - [716, 370.954] + - [852, 370.954] - - [256, 4288, 1, 128] - - [661, 2907.99] + - [797, 2907.99] - - [256, 128, 1, 3328] - - [654, 3644.88] + - [790, 3644.88] - - [512, 8, 1, 500000] - - [566, 2025.89] + - [702, 2025.89] - - [6784, 2368, 1, 256] - - [679, 8470.41] + - [815, 8470.41] - - [5888, 128, 1, 128] - - [584, 2604.55] + - [720, 2604.55] - - [1408, 448, 1, 3328] - - [689, 6540.62] + - [825, 6540.62] - - [1024, 24000, 1, 2816] - - [706, 8364.03] + - [842, 8364.03] - - [704, 1024, 1, 1280] - - [689, 7277.28] + - [825, 7277.28] - - [1856, 256, 1, 3328] - - [679, 7039.14] + - [815, 7039.14] - - [1856, 2944, 1, 256] - - [688, 8151.59] + - [824, 8151.59] - - [5056, 1024, 1, 128] - - [662, 4422.82] + - [798, 4422.82] - - [64, 5888, 1, 1280] - - [640, 4854.62] + - [776, 4854.62] - - [7680, 3000, 1, 2560] - - [693, 8789.57] + - [829, 8789.57] - - [4224, 1500, 1, 176] - - [689, 7902.14] + - [825, 7902.14] - - [5124, 700, 1, 2560] - - [679, 8232.59] + - [815, 8232.59] - - [6784, 256, 1, 128] - - [660, 3548.92] + - [796, 3548.92] - - [5888, 704, 1, 128] - - [667, 3959.65] + - [803, 3959.65] - - [6784, 64, 1, 128] - - [595, 2150.82] + - [731, 2150.82] - - [4, 448, 1, 1280] - - [720, 268.063] + - [856, 268.063] - - [1024, 4288, 1, 1280] - - [694, 8363.72] + - [830, 8363.72] - - [2368, 5056, 1, 3328] - - [693, 8581.85] + - [829, 8581.85] - - [448, 4, 1, 128] - - [714, 16.8673] + - [850, 16.8673] - - [4, 256, 1, 3328] - - [723, 201.988] + - [859, 201.988] - - [4288, 1024, 1, 3328] - - [689, 8567.72] + - [825, 8567.72] - - [6144, 48000, 1, 2560] - - [697, 3751.68] + - [833, 3751.68] - - [1024, 5056, 1, 3328] - - [676, 9440.66] + - [812, 9440.66] - - [1024, 1856, 1, 3328] - - [697, 8244.36] + - [833, 8244.36] - - [704, 704, 1, 1280] - - [689, 5529.99] + - [825, 5529.99] - - [128, 2368, 1, 1280] - - [646, 5062.38] + - [782, 5062.38] - - [3584, 4, 1, 128] - - [715, 61.5949] + - [851, 61.5949] - - [3584, 256, 1, 1280] - - [713, 6260.24] + - [849, 6260.24] - - [4, 128, 1, 128] - - [714, 1.2587] + - [850, 1.2587] - - [128, 4288, 1, 3328] - - [625, 6186.15] + - [761, 6186.15] - - [5124, 1500, 1, 2560] - - [693, 8432.62] + - [829, 8432.62] - - [3584, 128, 1, 1280] - - [679, 6547.85] + - [815, 6547.85] - - [4, 256, 1, 1280] - - [632, 180.144] + - [768, 180.144] - - [128, 704, 1, 3328] - - [604, 5177.81] + - [740, 5177.81] - - [4288, 6784, 1, 256] - - [677, 9005.34] + - [813, 9005.34] - - [3584, 2944, 1, 3328] - - [694, 8872.27] + - [830, 8872.27] - - [128, 1856, 1, 256] - - [679, 3690.48] + - [815, 3690.48] - - [64, 4288, 1, 256] - - [679, 3007.57] + - [815, 3007.57] - - [4, 3584, 1, 3328] - - [609, 639.99] + - [745, 639.99] - - [64, 4, 1, 3328] - - [723, 98.7074] + - [859, 98.7074] - - [4, 64, 1, 3328] - - [723, 91.9069] + - [859, 91.9069] - - [35, 700, 1, 2560] - - [577, 2397.65] + - [713, 2397.65] - - [5888, 2944, 1, 256] - - [687, 9031.28] + - [823, 9031.28] - - [4, 2368, 1, 256] - - [627, 256.968] + - [763, 256.968] - - [1856, 64, 1, 256] - - [611, 2222.96] + - [747, 2222.96] - - [5056, 128, 1, 1280] - - [679, 6557.85] + - [815, 6557.85] - - [448, 4288, 1, 1280] - - [703, 6891.66] + - [839, 6891.66] - - [256, 4288, 1, 256] - - [679, 6250.51] + - [815, 6250.51] - - [1024, 4288, 1, 128] - - [663, 3951.41] + - [799, 3951.41] - - [4, 1024, 1, 256] - - [627, 182.144] + - [763, 182.144] - - [5056, 4288, 1, 256] - - [683, 8933.43] + - [819, 8933.43] - - [1024, 448, 1, 256] - - [689, 4573.33] + - [825, 4573.33] - - [1024, 3584, 1, 256] - - [684, 7447.18] + - [820, 7447.18] - - [2944, 128, 1, 1280] - - [689, 5417.27] + - [825, 5417.27] - - [49, 2048, 64, 512] - - [732, 5916.91] + - [868, 5916.91] - - [2560, 32, 1, 2560] - - [626, 4076.99] + - [762, 4076.99] - - [64, 256, 1, 256] - - [643, 689.953] + - [779, 689.953] - - [1024, 4, 1, 512] - - [635, 288.17] + - [771, 288.17] - - [128, 2368, 1, 128] - - [589, 1809.68] + - [725, 1809.68] - - [256, 704, 1, 1280] - - [679, 4033.08] + - [815, 4033.08] - - [64, 2368, 1, 128] - - [580, 1165.88] + - [716, 1165.88] - - [176, 1500, 1, 1408] - - [607, 4922.13] + - [743, 4922.13] - - [448, 5888, 1, 1280] - - [689, 7550.21] + - [825, 7550.21] - - [512, 3000, 1, 2048] - - [711, 6562.44] + - [847, 6562.44] - - [5056, 448, 1, 128] - - [661, 3947.97] + - [797, 3947.97] - - [4288, 704, 1, 1280] - - [679, 8243.82] + - [815, 8243.82] - - [3584, 2944, 1, 128] - - [671, 4284.88] + - [807, 4284.88] - - [6784, 256, 1, 1280] - - [679, 7955.21] + - [815, 7955.21] - - [256, 2944, 1, 1280] - - [709, 6691.9] + - [845, 6691.9] - - [2560, 128, 1, 2560] - - [647, 5347.23] + - [783, 5347.23] - - [2368, 5888, 1, 3328] - - [684, 8919.07] + - [820, 8919.07] - - [4, 64, 1, 256] - - [632, 13.1032] + - [768, 13.1032] - - [704, 1024, 1, 3328] - - [709, 6648.12] + - [845, 6648.12] - - [2368, 1856, 1, 1280] - - [695, 8016.51] + - [831, 8016.51] - - [448, 5056, 1, 3328] - - [679, 8231.73] + - [815, 8231.73] - - [128, 448, 1, 128] - - [588, 441.208] + - [724, 441.208] - - [128, 6784, 1, 256] - - [689, 5850.05] + - [825, 5850.05] - - [512, 4, 1, 500000] - - [569, 1027.14] + - [705, 1027.14] - - [3584, 4288, 1, 128] - - [665, 4260.9] + - [801, 4260.9] - - [64, 448, 1, 128] - - [588, 253.554] + - [724, 253.554] - - [1024, 6000, 1, 2816] - - [693, 8886.14] + - [829, 8886.14] - - [5888, 4288, 1, 3328] - - [693, 8968.16] + - [829, 8968.16] - - [2368, 704, 1, 256] - - [709, 4663.24] + - [845, 4663.24] - - [256, 1856, 1, 3328] - - [681, 6480.63] + - [817, 6480.63] - - [1856, 128, 1, 256] - - [679, 3726.66] + - [815, 3726.66] - - [6784, 128, 1, 128] - - [582, 2824.01] + - [718, 2824.01] - - [3584, 1408, 1, 128] - - [665, 3666.78] + - [801, 3666.78] - - [1856, 5056, 1, 1280] - - [676, 8651.36] + - [812, 8651.36] - - [2944, 1024, 1, 1280] - - [687, 8765.21] + - [823, 8765.21] - - [5056, 4, 1, 256] - - [601, 428.688] + - [737, 428.688] - - [3584, 5888, 1, 3328] - - [687, 9347.75] + - [823, 9347.75] - - [2368, 4288, 1, 256] - - [697, 8013.1] + - [833, 8013.1] - - [1024, 2368, 1, 3328] - - [684, 8119.29] + - [820, 8119.29] - - [128, 3584, 1, 128] - - [584, 2584.62] + - [720, 2584.62] - - [704, 1408, 1, 256] - - [689, 6792.27] + - [825, 6792.27] - - [4096, 128, 1, 4096] - - [711, 6624.84] + - [847, 6624.84] - - [1024, 2944, 1, 128] - - [663, 3771.37] + - [799, 3771.37] - - [1024, 3584, 1, 1280] - - [684, 8952.71] + - [820, 8952.71] - - [4288, 5888, 1, 3328] - - [697, 9048.05] + - [833, 9048.05] - - [4288, 4, 1, 3328] - - [602, 615.206] + - [738, 615.206] - - [4608, 16, 1, 1536] - - [606, 2894.94] + - [742, 2894.94] - - [5888, 64, 1, 128] - - [593, 1827.16] + - [729, 1827.16] - - [4, 5888, 1, 128] - - [714, 179.544] + - [850, 179.544] - - [1024, 2944, 1, 3328] - - [685, 8298.77] + - [821, 8298.77] - - [2048, 64, 1, 2048] - - [614, 4963.77] + - [750, 4963.77] - - [6144, 2, 1, 2560] - - [603, 477.88] + - [739, 477.88] - - [256, 6784, 1, 1280] - - [677, 7491.94] + - [813, 7491.94] - - [1856, 3584, 1, 256] - - [689, 7580.6] + - [825, 7580.6] - - [128, 448, 1, 3328] - - [640, 4417.71] + - [776, 4417.71] - - [6784, 1856, 1, 128] - - [668, 4621.74] + - [804, 4621.74] - - [1024, 1500, 1, 2048] - - [689, 6284.5] + - [825, 6284.5] - - [5056, 128, 1, 256] - - [689, 5705.16] + - [825, 5705.16] - - [512, 24000, 1, 2816] - - [676, 8919.85] + - [812, 8919.85] - - [256, 5888, 1, 1280] - - [691, 7978.0] + - [827, 7978.0] - - [4, 128, 1, 1280] - - [632, 94.2609] + - [768, 94.2609] - - [4288, 6784, 1, 3328] - - [697, 9012.58] + - [833, 9012.58] - - [6784, 128, 1, 1280] - - [681, 6807.35] + - [817, 6807.35] - - [64, 1408, 1, 256] - - [610, 2045.19] + - [746, 2045.19] - - [2368, 1408, 1, 128] - - [661, 4340.73] + - [797, 4340.73] - - [1856, 448, 1, 256] - - [710, 3639.99] + - [846, 3639.99] - - [1408, 1024, 1, 128] - - [669, 3417.68] + - [805, 3417.68] - - [128, 64, 1, 128] - - [590, 68.7241] + - [726, 68.7241] - - [6784, 3584, 1, 3328] - - [687, 9425.63] + - [823, 9425.63] - - [1760, 7000, 1, 1760] - - [684, 8780.41] + - [820, 8780.41] - - [1024, 704, 1, 3328] - - [701, 5644.6] + - [837, 5644.6] - - [64, 64, 1, 128] - - [580, 38.2023] + - [716, 38.2023] - - [2368, 5056, 1, 1280] - - [698, 8462.41] + - [834, 8462.41] - - [64, 4, 1, 1280] - - [632, 46.6455] + - [768, 46.6455] - - [1408, 2368, 1, 1280] - - [684, 8235.08] + - [820, 8235.08] - - [128, 1408, 1, 1280] - - [646, 4491.66] + - [782, 4491.66] - - [1024, 1, 1, 512] - - [650, 82.02] + - [786, 82.02] - - [4, 1408, 1, 128] - - [714, 56.42] + - [850, 56.42] - - [704, 4288, 1, 128] - - [668, 3942.96] + - [804, 3942.96] - - [128, 1856, 1, 3328] - - [634, 6111.93] + - [770, 6111.93] - - [2944, 2944, 1, 256] - - [693, 8640.22] + - [829, 8640.22] - - [2944, 4, 1, 1280] - - [627, 554.265] + - [763, 554.265] - - [5888, 4, 1, 256] - - [609, 435.744] + - [745, 435.744] - - [6784, 256, 1, 256] - - [689, 7025.96] + - [825, 7025.96] - - [256, 5056, 1, 3328] - - [689, 8249.57] + - [825, 8249.57] - - [128, 4288, 1, 1280] - - [679, 5561.74] + - [815, 5561.74] - - [5056, 1856, 1, 128] - - [673, 3975.28] + - [809, 3975.28] - - [1024, 3000, 1, 1536] - - [694, 8544.54] + - [830, 8544.54] - - [5056, 1024, 1, 3328] - - [687, 9361.47] + - [823, 9361.47] - - [128, 128, 1, 256] - - [639, 699.151] + - [775, 699.151] - - [1760, 64, 1, 1760] - - [607, 4956.26] + - [743, 4956.26] - - [4288, 3584, 1, 3328] - - [707, 7506.18] + - [843, 7506.18] - - [448, 704, 1, 3328] - - [679, 4697.66] + - [815, 4697.66] - - [448, 448, 1, 128] - - [596, 1249.62] + - [732, 1249.62] - - [1024, 2368, 1, 1280] - - [689, 7756.44] + - [825, 7756.44] - - [1856, 704, 1, 3328] - - [689, 8340.66] + - [825, 8340.66] - - [512, 1500, 1, 2560] - - [691, 6041.39] + - [827, 6041.39] - - [5888, 6784, 1, 3328] - - [687, 9199.38] + - [823, 9199.38] - - [704, 4288, 1, 1280] - - [681, 8342.06] + - [817, 8342.06] - - [128, 50176, 1, 512] - - [727, 7589.48] + - [863, 7589.48] - - [704, 256, 1, 256] - - [679, 2912.81] + - [815, 2912.81] - - [1024, 48000, 1, 2048] - - [684, 8947.42] + - [820, 8947.42] - - [4288, 1024, 1, 128] - - [660, 4291.75] + - [796, 4291.75] - - [3136, 64, 128, 64] - - [742, 8175.16] - - - [784, 512, 64, 128] - - [740, 8378.44] - - - [3136, 256, 64, 64] - - [743, 8506.75] - - - [12544, 1024, 1, 256] - - [736, 8928.03] + - [878, 8175.16] - - [784, 128, 128, 512] - - [741, 8190.63] + - [877, 8190.63] - - [784, 512, 256, 128] - - [739, 8637.24] - - - [3136, 64, 64, 256] - - [738, 8783.03] - - - [3136, 512, 1, 2048] - - [735, 7298.42] - - - [12544, 256, 1, 1024] - - [747, 7667.35] - - - [3136, 2048, 1, 512] - - [746, 8447.32] + - [875, 8637.24] - - [3136, 256, 256, 64] - - [739, 8663.18] + - [875, 8663.18] - - [3136, 64, 128, 256] - - [737, 8943.56] - - - [784, 128, 64, 512] - - [745, 8006.37] + - [873, 8943.56] - - [3136, 64, 256, 64] - - [742, 8267.22] + - [878, 8267.22] - - [784, 512, 128, 128] - - [739, 8564.35] - - - [3136, 64, 64, 64] - - [742, 8009.45] + - [875, 8564.35] - - [784, 128, 256, 512] - - [743, 8377.16] + - [879, 8377.16] - - [3136, 64, 256, 256] - - [744, 9033.98] + - [880, 9033.98] - - [3136, 256, 128, 64] - - [739, 8624.56] + - [875, 8624.56] - - [1024, 256, 1, 1024] - - [765, 6331.13] + - [901, 6331.13] - - [1024, 512, 1, 2048] - - [764, 8100.14] + - [900, 8100.14] - - [512, 200, 1, 512] - - [773, 2861.93] + - [909, 2861.93] - - [4096, 256, 1, 2048] - - [756, 8812.82] + - [892, 8812.82] - - [4096, 512, 1, 1024] - - [766, 9068.87] + - [902, 9068.87] - - [1024, 200, 1, 1024] - - [765, 5110.12] + - [901, 5110.12] - - [1024, 512, 1, 1024] - - [758, 7785.35] + - [894, 7785.35] - - [2048, 256, 1, 4096] - - [768, 8438.81] + - [904, 8438.81] - - [2048, 768, 1, 512] - - [750, 8618.53] + - [886, 8618.53] - - [512, 256, 1, 1024] - - [770, 4835.03] + - [906, 4835.03] - - [512, 768, 1, 2048] - - [767, 6909.04] + - [903, 6909.04] - - [2048, 256, 1, 1024] - - [763, 7941.98] + - [899, 7941.98] - - [1024, 256, 1, 2048] - - [760, 6997.9] + - [896, 6997.9] - - [2048, 200, 1, 512] - - [763, 5649.76] + - [899, 5649.76] - - [4096, 200, 1, 1024] - - [761, 6678.93] + - [897, 6678.93] - - [2048, 200, 1, 4096] - - [769, 6706.69] + - [905, 6706.69] - - [2048, 512, 1, 1024] - - [766, 8549.0] + - [902, 8549.0] - - [1024, 1024, 1, 512] - - [761, 8046.73] + - [897, 8046.73] - - [1024, 200, 1, 4096] - - [760, 5884.36] + - [896, 5884.36] - - [2048, 512, 1, 4096] - - [771, 8995.94] + - [907, 8995.94] - - [4096, 512, 1, 2048] - - [766, 9298.18] - - - [512, 256, 1, 2048] - - [759, 5186.26] + - [902, 9298.18] - - [4096, 1024, 1, 2048] - - [748, 9790.77] + - [884, 9790.77] - - [2048, 1024, 1, 2048] - - [749, 9278.9] + - [885, 9278.9] - - [1024, 200, 1, 512] - - [765, 4535.46] + - [901, 4535.46] - - [1024, 1024, 1, 4096] - - [756, 8967.39] + - [892, 8967.39] - - [2048, 1024, 1, 4096] - - [751, 9500.56] + - [887, 9500.56] - - [4096, 200, 1, 2048] - - [757, 7082.68] + - [893, 7082.68] - - [2048, 200, 1, 1024] - - [763, 6212.04] + - [899, 6212.04] - - [1024, 768, 1, 512] - - [764, 7401.81] + - [900, 7401.81] - - [2048, 512, 1, 512] - - [761, 8124.66] + - [897, 8124.66] - - [2048, 200, 1, 2048] - - [763, 6561.9] + - [899, 6561.9] - - [2048, 256, 1, 2048] - - [764, 8224.23] + - [900, 8224.23] - - [512, 768, 1, 512] - - [762, 6469.46] + - [898, 6469.46] - - [512, 200, 1, 1024] - - [765, 3755.74] + - [901, 3755.74] - - [4096, 1024, 1, 1024] - - [748, 9605.95] + - [884, 9605.95] - - [4096, 256, 1, 4096] - - [771, 8961.39] + - [907, 8961.39] - - [1024, 512, 1, 512] - - [764, 7109.09] + - [900, 7109.09] - - [512, 256, 1, 512] - - [772, 4033.08] + - [908, 4033.08] - - [1024, 256, 1, 4096] - - [760, 7326.4] - - - [4096, 512, 1, 4096] - - [752, 9472.07] + - [896, 7326.4] - - [1024, 200, 1, 2048] - - [753, 5530.56] + - [889, 5530.56] - - [2048, 1024, 1, 512] - - [754, 8995.93] + - [890, 8995.93] - - [1024, 1024, 1, 2048] - - [761, 8830.21] + - [897, 8830.21] - - [4096, 256, 1, 1024] - - [761, 8581.8] + - [897, 8581.8] - - [512, 768, 1, 1024] - - [762, 6876.01] + - [898, 6876.01] - - [1024, 512, 1, 4096] - - [758, 8484.15] + - [894, 8484.15] - - [1024, 256, 1, 512] - - [755, 5668.08] + - [891, 5668.08] - - [4096, 200, 1, 4096] - - [768, 7018.69] + - [904, 7018.69] - - [2048, 256, 1, 512] - - [768, 7079.09] + - [904, 7079.09] - - [512, 200, 1, 2048] - - [773, 4283.5] + - [909, 4283.5] - - [1024, 1024, 1, 1024] - - [756, 8565.37] + - [892, 8565.37] - - [2048, 512, 1, 2048] - - [756, 8850.59] + - [892, 8850.59] - - [4096, 1024, 1, 4096] - - [749, 9843.28] + - [885, 9843.28] - - [2048, 1024, 1, 1024] - - [754, 9234.21] + - [890, 9234.21] - - [4096, 384, 1, 2048] - - [796, 8892.62] + - [932, 8892.62] - - [4096, 192, 1, 2048] - - [790, 8024.28] + - [926, 8024.28] - - [289, 160, 64, 768] - - [792, 6783.73] + - [928, 6783.73] - - [1225, 192, 64, 384] - - [779, 9373.93] + - [915, 9373.93] - - [5329, 64, 64, 160] - - [783, 9186.79] + - [919, 9186.79] - - [1225, 64, 64, 288] - - [774, 8492.51] + - [910, 8492.51] - - [1225, 64, 64, 384] - - [778, 8735.86] + - [914, 8735.86] - - [289, 128, 64, 1024] - - [793, 7000.3] + - [929, 7000.3] - - [4096, 320, 1, 1280] - - [798, 8302.36] + - [934, 8302.36] - - [4096, 384, 1, 1536] - - [780, 9052.55] + - [916, 9052.55] - - [4096, 192, 1, 1280] - - [795, 7561.95] + - [931, 7561.95] - - [289, 192, 64, 768] - - [791, 7882.6] + - [927, 7882.6] - - [1225, 48, 64, 256] - - [782, 6620.35] + - [918, 6620.35] - - [289, 192, 64, 1024] - - [789, 7347.09] + - [925, 7347.09] - - [1225, 64, 64, 192] - - [775, 8098.45] + - [911, 8098.45] - - [1225, 96, 64, 384] - - [776, 8303.18] + - [912, 8303.18] - - [1225, 48, 64, 288] - - [784, 6746.87] + - [920, 6746.87] - - [4096, 320, 1, 2048] - - [785, 8384.52] + - [921, 8384.52] - - [4096, 256, 1, 1536] - - [797, 8734.44] + - [933, 8734.44] - - [1225, 48, 64, 192] - - [784, 6516.46] + - [920, 6516.46] - - [4096, 384, 1, 1280] - - [794, 9023.34] + - [930, 9023.34] - - [1225, 64, 64, 256] - - [781, 8319.44] + - [917, 8319.44] - - [4096, 448, 1, 1280] - - [785, 8343.42] + - [921, 8343.42] - - [289, 128, 64, 768] - - [787, 7668.08] + - [923, 7668.08] - - [289, 256, 64, 1024] - - [788, 7535.56] + - [924, 7535.56] - - [4096, 448, 1, 2048] - - [785, 8572.41] + - [921, 8572.41] - - [5329, 80, 64, 64] - - [784, 6492.54] + - [920, 6492.54] - - [1225, 32, 64, 192] - - [777, 6278.64] + - [913, 6278.64] - - [289, 384, 64, 1024] - - [786, 7767.67] + - [922, 7767.67] - - [1024, 3594, 1, 4096] - - [805, 8661.52] + - [941, 8661.52] - - [4096, 3103, 1, 1024] - - [815, 9652.23] + - [951, 9652.23] - - [4096, 3136, 1, 1024] - - [799, 9723.15] + - [935, 9723.15] - - [1024, 3141, 1, 4096] - - [817, 8612.12] + - [953, 8612.12] - - [64, 147, 432, 148] - - [832, 6372.03] + - [968, 6372.03] - - [4096, 3559, 1, 1024] - - [804, 9906.35] + - [940, 9906.35] - - [4096, 3368, 1, 1024] - - [799, 9721.01] + - [935, 9721.01] - - [1024, 3335, 1, 4096] - - [823, 8990.29] + - [959, 8990.29] - - [1024, 3510, 1, 4096] - - [823, 9440.68] + - [959, 9440.68] - - [4096, 3209, 1, 1024] - - [804, 9632.76] + - [940, 9632.76] - - [4096, 3322, 1, 1024] - - [803, 9939.52] + - [939, 9939.52] - - [1024, 3400, 1, 4096] - - [822, 9156.09] + - [958, 9156.09] - - [1024, 3995, 1, 4096] - - [805, 9610.25] + - [941, 9610.25] - - [1024, 3503, 1, 4096] - - [823, 9446.57] + - [959, 9446.57] - - [4096, 3594, 1, 1024] - - [814, 9691.96] + - [950, 9691.96] - - [4096, 3473, 1, 1024] - - [803, 9698.9] + - [939, 9698.9] - - [4096, 3522, 1, 1024] - - [804, 9816.92] + - [940, 9816.92] - - [1024, 3103, 1, 4096] - - [801, 8491.05] + - [937, 8491.05] - - [1024, 3214, 1, 4096] - - [822, 8667.67] + - [958, 8667.67] - - [4096, 3449, 1, 1024] - - [814, 9795.71] + - [950, 9795.71] - - [1024, 3136, 1, 4096] - - [823, 8500.61] + - [959, 8500.61] - - [1024, 3955, 1, 33708] - - [803, 9634.94] + - [939, 9634.94] - - [1024, 3780, 1, 4096] - - [806, 9088.88] + - [942, 9088.88] - - [1024, 3906, 1, 33708] - - [804, 9515.46] + - [940, 9515.46] - - [1024, 3386, 1, 4096] - - [823, 9116.05] + - [959, 9116.05] - - [4096, 3396, 1, 1024] - - [814, 9665.6] + - [950, 9665.6] - - [1024, 3183, 1, 4096] - - [801, 8662.94] + - [937, 8662.94] - - [1024, 3098, 1, 4096] - - [817, 8490.22] + - [953, 8490.22] - - [1024, 3548, 1, 4096] - - [823, 9555.63] + - [959, 9555.63] - - [1024, 3224, 1, 4096] - - [816, 8760.88] + - [952, 8760.88] - - [4096, 3469, 1, 1024] - - [803, 9687.21] + - [939, 9687.21] - - [1024, 3582, 1, 4096] - - [820, 9691.0] + - [956, 9691.0] - - [1024, 2977, 1, 4096] - - [805, 9379.38] + - [941, 9379.38] - - [1024, 3939, 1, 1024] - - [802, 9172.11] + - [938, 9172.11] - - [64, 123, 528, 123] - - [850, 6346.17] + - [986, 6346.17] - - [64, 12, 5040, 12] - - [827, 1536.1] + - [963, 1536.1] - - [4096, 3176, 1, 1024] - - [815, 9712.2] + - [951, 9712.2] - - [1024, 3559, 1, 4096] - - [819, 9579.84] + - [955, 9579.84] - - [1024, 3478, 1, 4096] - - [823, 9373.85] + - [959, 9373.85] - - [4096, 3343, 1, 1024] - - [799, 9638.77] + - [935, 9638.77] - - [4096, 3440, 1, 1024] - - [799, 9853.96] + - [935, 9853.96] - - [1024, 3996, 1, 33708] - - [803, 9733.55] + - [939, 9733.55] - - [1024, 4012, 1, 4096] - - [804, 9636.99] + - [940, 9636.99] - - [1024, 3322, 1, 4096] - - [823, 8945.12] + - [959, 8945.12] - - [1024, 3990, 1, 33708] - - [803, 9720.31] + - [939, 9720.31] - - [1024, 3314, 1, 4096] - - [823, 8944.72] + - [959, 8944.72] - - [4096, 3513, 1, 1024] - - [803, 9794.95] + - [939, 9794.95] - - [1024, 3562, 1, 4096] - - [823, 9597.28] + - [959, 9597.28] - - [1024, 3443, 1, 4096] - - [823, 9279.52] + - [959, 9279.52] - - [1024, 3554, 1, 4096] - - [820, 9552.16] + - [956, 9552.16] - - [1024, 3063, 1, 4096] - - [805, 9622.58] + - [941, 9622.58] - - [64, 111, 576, 112] - - [850, 6274.65] + - [986, 6274.65] - - [4096, 3460, 1, 1024] - - [803, 9665.69] + - [939, 9665.69] - - [1024, 3209, 1, 4096] - - [802, 8708.39] + - [938, 8708.39] - - [1024, 3147, 1, 4096] - - [823, 8492.23] + - [959, 8492.23] - - [4096, 3387, 1, 1024] - - [800, 9761.34] + - [936, 9761.34] - - [4096, 3436, 1, 1024] - - [799, 9815.15] + - [935, 9815.15] - - [1024, 3341, 1, 4096] - - [822, 9005.07] + - [958, 9005.07] - - [1024, 3516, 1, 4096] - - [822, 9471.39] + - [958, 9471.39] - - [4096, 3277, 1, 1024] - - [803, 9807.12] + - [939, 9807.12] - - [1024, 3454, 1, 4096] - - [823, 9301.03] + - [959, 9301.03] - - [1024, 3969, 1, 4096] - - [803, 9539.82] + - [939, 9539.82] - - [1024, 3999, 1, 4096] - - [804, 9607.52] + - [940, 9607.52] - - [1024, 4032, 1, 4096] - - [805, 9693.47] + - [941, 9693.47] - - [4096, 3541, 1, 1024] - - [804, 9866.73] + - [940, 9866.73] - - [4096, 3334, 1, 1024] - - [815, 9614.41] + - [951, 9614.41] - - [1024, 3365, 1, 4096] - - [823, 9058.58] + - [959, 9058.58] - - [1024, 3527, 1, 4096] - - [823, 9510.31] + - [959, 9510.31] - - [1024, 3190, 1, 4096] - - [822, 8627.8] + - [958, 8627.8] - - [4096, 3906, 1, 1024] - - [800, 9817.78] + - [936, 9817.78] - - [1024, 3593, 1, 4096] - - [805, 8663.09] + - [941, 8663.09] - - [1024, 3336, 1, 4096] - - [823, 8991.13] + - [959, 8991.13] - - [4096, 3504, 1, 1024] - - [803, 9769.86] + - [939, 9769.86] - - [4096, 3977, 1, 1024] - - [804, 9742.62] + - [940, 9742.62] - - [1024, 3906, 1, 4096] - - [804, 9386.25] + - [940, 9386.25] - - [4096, 3415, 1, 1024] - - [814, 9802.7] + - [950, 9802.7] - - [1024, 3295, 1, 4096] - - [822, 8879.26] + - [958, 8879.26] - - [4096, 3321, 1, 1024] - - [804, 9931.43] + - [940, 9931.43] - - [1024, 3072, 1, 4096] - - [805, 9671.71] + - [941, 9671.71] - - [1024, 3408, 1, 4096] - - [822, 9182.83] + - [958, 9182.83] - - [1024, 3522, 1, 4096] - - [823, 9484.63] + - [959, 9484.63] - - [4096, 3751, 1, 1024] - - [804, 9778.86] + - [940, 9778.86] - - [4096, 3378, 1, 1024] - - [814, 9692.77] + - [950, 9692.77] - - [64, 77, 816, 77] - - [856, 4850.29] + - [992, 4850.29] - - [1024, 3925, 1, 33708] - - [803, 9560.88] + - [939, 9560.88] - - [1024, 3990, 1, 1024] - - [805, 9272.75] + - [941, 9272.75] - - [1024, 3290, 1, 4096] - - [816, 8905.61] + - [952, 8905.61] - - [4096, 3500, 1, 1024] - - [804, 9761.82] + - [940, 9761.82] - - [4096, 3565, 1, 1024] - - [803, 9919.37] + - [939, 9919.37] - - [1024, 3484, 1, 4096] - - [822, 9376.52] + - [958, 9376.52] - - [4096, 3395, 1, 1024] - - [815, 9788.16] + - [951, 9788.16] - - [64, 92, 688, 92] - - [842, 5606.1] + - [978, 5606.1] - - [1024, 3681, 1, 1024] - - [807, 8690.23] + - [943, 8690.23] - - [64, 159, 400, 159] - - [834, 6518.97] + - [970, 6518.97] - - [1024, 3584, 1, 1024] - - [822, 9365.37] + - [958, 9365.37] - - [4096, 3093, 1, 1024] - - [814, 9623.41] + - [950, 9623.41] - - [1024, 4050, 1, 1024] - - [806, 9354.14] + - [942, 9354.14] - - [1024, 3301, 1, 4096] - - [823, 8889.04] + - [959, 8889.04] - - [1024, 3581, 1, 4096] - - [822, 9673.82] + - [958, 9673.82] - - [4096, 3374, 1, 1024] - - [815, 9707.33] + - [951, 9707.33] - - [1024, 3449, 1, 4096] - - [823, 9270.9] + - [959, 9270.9] - - [4096, 3215, 1, 1024] - - [804, 9645.25] + - [940, 9645.25] - - [4096, 3312, 1, 1024] - - [804, 9888.72] + - [940, 9888.72] - - [4096, 3479, 1, 1024] - - [804, 9698.61] + - [940, 9698.61] - - [4096, 3544, 1, 1024] - - [804, 9875.09] + - [940, 9875.09] - - [1024, 3263, 1, 4096] - - [823, 8787.61] + - [959, 8787.61] - - [4096, 3455, 1, 1024] - - [814, 9845.29] + - [950, 9845.29] - - [1024, 3379, 1, 4096] - - [820, 9100.01] + - [956, 9100.01] - - [1024, 3490, 1, 4096] - - [823, 9397.49] + - [959, 9397.49] - - [1024, 3368, 1, 4096] - - [823, 9079.25] + - [959, 9079.25] - - [4096, 3186, 1, 1024] - - [799, 9750.17] + - [935, 9750.17] - - [1024, 3428, 1, 4096] - - [823, 9232.92] + - [959, 9232.92] - - [64, 85, 752, 84] - - [838, 5342.67] + - [974, 5342.67] - - [4096, 3561, 1, 1024] - - [804, 9914.02] + - [940, 9914.02] - - [4096, 3418, 1, 1024] - - [814, 9765.86] + - [950, 9765.86] - - [1024, 3064, 1, 4096] - - [805, 9621.68] + - [941, 9621.68] - - [4096, 3259, 1, 1024] - - [804, 9765.52] + - [940, 9765.52] - - [4096, 3308, 1, 1024] - - [803, 9900.46] + - [939, 9900.46] - - [1024, 3533, 1, 4096] - - [823, 9520.12] + - [959, 9520.12] - - [1024, 3344, 1, 4096] - - [823, 9014.55] + - [959, 9014.55] - - [1024, 4030, 1, 1024] - - [805, 9354.1] + - [941, 9354.1] - - [4096, 3459, 1, 1024] - - [804, 9656.2] + - [940, 9656.2] - - [1024, 3572, 1, 4096] - - [820, 9640.07] + - [956, 9640.07] - - [1024, 3925, 1, 1024] - - [816, 9173.74] + - [952, 9173.74] - - [4096, 3435, 1, 1024] - - [799, 9778.2] + - [935, 9778.2] - - [1024, 3956, 1, 4096] - - [806, 9498.56] + - [942, 9498.56] - - [1024, 3463, 1, 4096] - - [823, 9332.46] + - [959, 9332.46] - - [4096, 3182, 1, 1024] - - [814, 9826.84] + - [950, 9826.84] - - [4096, 3976, 1, 1024] - - [814, 9741.99] + - [950, 9741.99] - - [1024, 3417, 1, 4096] - - [823, 9208.97] + - [959, 9208.97] - - [1024, 3528, 1, 4096] - - [823, 9509.09] + - [959, 9509.09] - - [4096, 3446, 1, 1024] - - [814, 9816.97] + - [950, 9816.97] - - [64, 122, 528, 123] - - [850, 6325.98] + - [986, 6325.98] - - [1024, 3543, 1, 4096] - - [823, 9538.73] + - [959, 9538.73] - - [4096, 3287, 1, 1024] - - [803, 9846.04] + - [939, 9846.04] - - [1024, 3499, 1, 4096] - - [823, 9428.51] + - [959, 9428.51] - - [1024, 3231, 1, 4096] - - [816, 8769.91] + - [952, 8769.91] - - [64, 17, 3632, 17] - - [838, 1934.94] + - [974, 1934.94] - - [4096, 3519, 1, 1024] - - [803, 9804.38] + - [939, 9804.38] - - [4096, 3552, 1, 1024] - - [803, 9892.65] + - [939, 9892.65] - - [1024, 3458, 1, 4096] - - [823, 9312.28] + - [959, 9312.28] - - [64, 93, 688, 92] - - [842, 5660.22] + - [978, 5660.22] - - [1024, 3374, 1, 4096] - - [817, 9110.41] + - [953, 9110.41] - - [1024, 3396, 1, 4096] - - [823, 9145.79] + - [959, 9145.79] - - [1024, 2967, 1, 4096] - - [805, 9364.76] + - [941, 9364.76] - - [64, 19, 3264, 19] - - [842, 2142.47] + - [978, 2142.47] - - [4096, 3482, 1, 1024] - - [803, 9714.2] + - [939, 9714.2] - - [64, 32, 1984, 32] - - [853, 3619.91] + - [989, 3619.91] - - [64, 102, 624, 99] - - [844, 5515.33] + - [980, 5515.33] - - [1024, 3226, 1, 4096] - - [802, 8790.47] + - [938, 8790.47] - - [4096, 3377, 1, 1024] - - [800, 9684.08] + - [936, 9684.08] - - [4096, 3426, 1, 1024] - - [815, 9869.94] + - [951, 9869.94] - - [4096, 2935, 1, 1024] - - [815, 9762.11] + - [951, 9762.11] - - [64, 133, 480, 133] - - [854, 5891.32] + - [990, 5891.32] - - [1024, 3439, 1, 4096] - - [823, 9253.99] + - [959, 9253.99] - - [4096, 3267, 1, 1024] - - [803, 9783.9] + - [939, 9783.9] - - [4096, 3499, 1, 1024] - - [804, 9761.11] + - [940, 9761.11] - - [4096, 3356, 1, 1024] - - [815, 9679.44] + - [951, 9679.44] - - [64, 232, 272, 232] - - [858, 7181.03] + - [994, 7181.03] - - [64, 162, 400, 159] - - [818, 6444.63] + - [954, 6444.63] - - [4096, 3939, 1, 1024] - - [814, 9878.0] + - [950, 9878.0] - - [1024, 3526, 1, 4096] - - [823, 9508.1] + - [959, 9508.1] - - [1024, 3859, 1, 33708] - - [804, 9402.13] + - [940, 9402.13] - - [1024, 3385, 1, 4096] - - [822, 9107.28] + - [958, 9107.28] - - [1024, 3496, 1, 4096] - - [823, 9418.0] + - [959, 9418.0] - - [4096, 3141, 1, 1024] - - [815, 9682.54] + - [951, 9682.54] - - [4096, 3510, 1, 1024] - - [803, 9786.59] + - [939, 9786.59] - - [1024, 3434, 1, 4096] - - [823, 9246.7] + - [959, 9246.7] - - [4096, 3969, 1, 1024] - - [803, 9714.85] + - [939, 9714.85] - - [1024, 3121, 1, 4096] - - [801, 8464.32] + - [937, 8464.32] - - [1024, 3232, 1, 4096] - - [823, 8711.73] + - [959, 8711.73] - - [1024, 4030, 1, 33708] - - [804, 9816.31] + - [940, 9816.31] - - [1024, 3780, 1, 33708] - - [812, 9315.54] + - [948, 9315.54] - - [1024, 3969, 1, 1024] - - [801, 9248.54] + - [937, 9248.54] - - [4096, 3527, 1, 1024] - - [803, 9832.94] + - [939, 9832.94] - - [4096, 3336, 1, 1024] - - [800, 9623.35] + - [936, 9623.35] - - [4096, 3290, 1, 1024] - - [803, 9852.21] + - [939, 9852.21] - - [64, 9, 6544, 9] - - [843, 1068.24] + - [979, 1068.24] - - [1024, 3469, 1, 4096] - - [823, 9350.55] + - [959, 9350.55] - - [4096, 3490, 1, 1024] - - [803, 9737.56] + - [939, 9737.56] - - [4096, 3064, 1, 1024] - - [803, 9890.02] + - [939, 9890.02] - - [4096, 3582, 1, 1024] - - [804, 9961.38] + - [940, 9961.38] - - [1024, 3956, 1, 1024] - - [801, 9294.25] + - [937, 9294.25] - - [4096, 3417, 1, 1024] - - [799, 9811.66] + - [935, 9811.66] - - [1024, 2736, 1, 4096] - - [805, 8636.7] + - [941, 8636.7] - - [64, 78, 816, 78] - - [842, 4946.1] + - [978, 4946.1] - - [1024, 3205, 1, 4096] - - [817, 8657.21] + - [953, 8657.21] - - [1024, 3143, 1, 4096] - - [817, 8567.87] + - [953, 8567.87] - - [1024, 4020, 1, 4096] - - [805, 9664.62] + - [941, 9664.62] - - [1024, 3318, 1, 4096] - - [802, 8967.05] + - [938, 8967.05] - - [4096, 3364, 1, 1024] - - [815, 9697.18] + - [951, 9697.18] - - [1024, 3353, 1, 4096] - - [823, 9034.17] + - [959, 9034.17] - - [1024, 3464, 1, 4096] - - [823, 9326.05] + - [959, 9326.05] - - [4096, 3205, 1, 1024] - - [803, 9619.1] + - [939, 9619.1] - - [4096, 3318, 1, 1024] - - [804, 9932.66] + - [940, 9932.66] - - [1024, 3402, 1, 4096] - - [822, 9153.49] + - [958, 9153.49] - - [4096, 3181, 1, 1024] - - [814, 9789.15] + - [950, 9789.15] - - [4096, 3550, 1, 1024] - - [804, 9888.13] + - [940, 9888.13] - - [4096, 3445, 1, 1024] - - [814, 9752.65] + - [950, 9752.65] - - [1024, 3138, 1, 4096] - - [800, 8484.1] + - [936, 8484.1] - - [64, 99, 624, 99] - - [850, 5323.99] + - [986, 5323.99] - - [4096, 3079, 1, 1024] - - [800, 9562.26] + - [936, 9562.26] - - [4096, 3144, 1, 1024] - - [814, 9686.66] + - [950, 9686.66] - - [4096, 3860, 1, 1024] - - [815, 9733.42] + - [951, 9733.42] - - [1024, 3515, 1, 4096] - - [823, 9478.44] + - [959, 9478.44] - - [4096, 3408, 1, 1024] - - [800, 9764.96] + - [936, 9764.96] - - [64, 101, 624, 102] - - [850, 5482.79] + - [986, 5482.79] - - [1024, 3181, 1, 4096] - - [802, 8593.26] + - [938, 8593.26] - - [4096, 3298, 1, 1024] - - [804, 9867.72] + - [940, 9867.72] - - [4096, 3585, 1, 1024] - - [814, 9633.01] + - [950, 9633.01] - - [1024, 3550, 1, 4096] - - [823, 9564.46] + - [959, 9564.46] - - [1024, 4020, 1, 1024] - - [806, 9339.15] + - [942, 9339.15] - - [4096, 3481, 1, 1024] - - [804, 9714.0] + - [940, 9714.0] - - [4096, 3530, 1, 1024] - - [804, 9833.99] + - [940, 9833.99] - - [4096, 3425, 1, 1024] - - [800, 9675.66] + - [936, 9675.66] - - [4096, 4026, 1, 1024] - - [804, 9849.77] + - [940, 9849.77] - - [1024, 3860, 1, 1024] - - [817, 9073.59] + - [953, 9073.59] - - [4096, 3975, 1, 1024] - - [804, 9737.72] + - [940, 9737.72] - - [1024, 3286, 1, 4096] - - [801, 8884.24] + - [937, 8884.24] - - [1024, 3176, 1, 4096] - - [801, 8597.48] + - [937, 8597.48] - - [1024, 3894, 1, 4096] - - [805, 9359.13] + - [941, 9359.13] - - [4096, 3355, 1, 1024] - - [814, 9693.09] + - [950, 9693.09] - - [4096, 3404, 1, 1024] - - [814, 9786.12] + - [950, 9786.12] - - [1024, 3501, 1, 4096] - - [822, 9426.14] + - [958, 9426.14] - - [4096, 3245, 1, 1024] - - [804, 9723.57] + - [940, 9723.57] - - [1024, 3431, 1, 4096] - - [820, 9244.32] + - [956, 9244.32] - - [1024, 4000, 1, 1024] - - [816, 9344.03] + - [952, 9344.03] - - [4096, 3509, 1, 1024] - - [803, 9781.72] + - [939, 9781.72] - - [4096, 3558, 1, 1024] - - [804, 9905.15] + - [940, 9905.15] - - [1024, 3535, 1, 4096] - - [822, 9519.15] + - [958, 9519.15] - - [1024, 3414, 1, 4096] - - [820, 9198.05] + - [956, 9198.05] - - [1024, 3445, 1, 4096] - - [823, 9279.66] + - [959, 9279.66] - - [1024, 3436, 1, 4096] - - [823, 9259.7] + - [959, 9259.7] - - [4096, 3472, 1, 1024] - - [804, 9685.27] + - [940, 9685.27] - - [1024, 3211, 1, 4096] - - [802, 8708.41] + - [938, 8708.41] - - [64, 7, 8192, 7] - - [839, 802.916] + - [975, 802.916] - - [4096, 3383, 1, 1024] - - [814, 9734.82] + - [950, 9734.82] - - [4096, 3448, 1, 1024] - - [815, 9828.54] + - [951, 9828.54] - - [1024, 3343, 1, 4096] - - [816, 9010.46] + - [952, 9010.46] - - [1024, 3518, 1, 4096] - - [823, 9468.02] + - [959, 9468.02] - - [4096, 3289, 1, 1024] - - [804, 9844.16] + - [940, 9844.16] - - [1024, 3440, 1, 4096] - - [819, 9269.52] + - [955, 9269.52] - - [1024, 4032, 1, 33708] - - [803, 9822.41] + - [939, 9822.41] - - [4096, 3489, 1, 1024] - - [803, 9742.03] + - [939, 9742.03] - - [4096, 3346, 1, 1024] - - [800, 9616.74] + - [936, 9616.74] - - [1024, 3534, 1, 4096] - - [822, 9524.29] + - [958, 9524.29] - - [1024, 3079, 1, 4096] - - [817, 8397.77] + - [953, 8397.77] - - [1024, 3955, 1, 4096] - - [804, 9492.25] + - [940, 9492.25] - - [4096, 3236, 1, 1024] - - [804, 9706.03] + - [940, 9706.03] - - [1024, 3545, 1, 4096] - - [822, 9551.97] + - [958, 9551.97] - - [1024, 3144, 1, 4096] - - [816, 8556.8] + - [952, 8556.8] - - [4096, 3780, 1, 1024] - - [803, 9847.6] + - [939, 9847.6] - - [4096, 3163, 1, 1024] - - [814, 9717.79] + - [950, 9717.79] - - [4096, 3468, 1, 1024] - - [804, 9686.49] + - [940, 9686.49] - - [1024, 3539, 1, 4096] - - [823, 9526.99] + - [959, 9526.99] - - [1024, 3541, 1, 4096] - - [823, 9532.86] + - [959, 9532.86] - - [4096, 3363, 1, 1024] - - [799, 9699.1] + - [935, 9699.1] - - [1024, 3475, 1, 4096] - - [823, 9357.1] + - [959, 9357.1] - - [4096, 3110, 1, 1024] - - [815, 9659.68] + - [951, 9659.68] - - [1024, 3509, 1, 4096] - - [822, 9450.59] + - [958, 9450.59] - - [1024, 3413, 1, 4096] - - [823, 9185.91] + - [959, 9185.91] - - [1024, 3975, 1, 1024] - - [801, 9315.52] + - [937, 9315.52] - - [4096, 3549, 1, 1024] - - [804, 9884.82] + - [940, 9884.82] - - [4096, 3342, 1, 1024] - - [814, 9644.37] + - [950, 9644.37] - - [1024, 2985, 1, 4096] - - [804, 9392.17] + - [940, 9392.17] - - [1024, 3876, 1, 33708] - - [803, 9442.32] + - [939, 9442.32] - - [4096, 3280, 1, 1024] - - [803, 9820.02] + - [939, 9820.02] - - [4096, 3191, 1, 1024] - - [815, 9862.18] + - [951, 9862.18] - - [4096, 3512, 1, 1024] - - [804, 9793.21] + - [940, 9793.21] - - [1024, 3560, 1, 4096] - - [820, 9555.55] + - [956, 9555.55] - - [4096, 2499, 1, 1024] - - [804, 9669.45] + - [940, 9669.45] - - [1024, 3248, 1, 4096] - - [801, 8811.94] + - [937, 8811.94] - - [4096, 3423, 1, 1024] - - [815, 9729.77] + - [951, 9729.77] - - [64, 111, 576, 111] - - [850, 5982.73] + - [986, 5982.73] - - [4096, 3297, 1, 1024] - - [803, 9865.29] + - [939, 9865.29] - - [4096, 3154, 1, 1024] - - [815, 9613.52] + - [951, 9613.52] - - [1024, 3303, 1, 4096] - - [802, 8951.89] + - [938, 8951.89] - - [1024, 3222, 1, 4096] - - [822, 8682.99] + - [958, 8682.99] - - [1024, 3978, 1, 1024] - - [806, 9235.03] + - [942, 9235.03] - - [4096, 3529, 1, 1024] - - [804, 9831.72] + - [940, 9831.72] - - [4096, 3386, 1, 1024] - - [814, 9755.77] + - [950, 9755.77] - - [64, 134, 480, 134] - - [829, 5990.63] + - [965, 5990.63] - - [1024, 3451, 1, 4096] - - [820, 9277.71] + - [956, 9277.71] - - [4096, 3562, 1, 1024] - - [804, 9908.92] + - [940, 9908.92] - - [4096, 3276, 1, 1024] - - [803, 9818.14] + - [939, 9818.14] - - [64, 135, 480, 132] - - [858, 6071.87] + - [994, 6071.87] - - [1024, 3894, 1, 33708] - - [803, 9487.89] + - [939, 9487.89] - - [64, 134, 480, 132] - - [857, 6091.75] + - [993, 6091.75] - - [4096, 3540, 1, 1024] - - [804, 9862.89] + - [940, 9862.89] - - [1024, 3416, 1, 4096] - - [822, 9206.27] + - [958, 9206.27] - - [1024, 4005, 1, 33708] - - [803, 9757.29] + - [939, 9757.29] - - [1024, 3942, 1, 4096] - - [806, 9455.85] + - [942, 9455.85] - - [4096, 3403, 1, 1024] - - [814, 9739.46] + - [950, 9739.46] - - [4096, 3381, 1, 1024] - - [815, 9760.14] + - [951, 9760.14] - - [1024, 3492, 1, 4096] - - [819, 9391.79] + - [955, 9391.79] - - [4096, 3101, 1, 1024] - - [815, 9626.02] + - [951, 9626.02] - - [1024, 3430, 1, 4096] - - [823, 9232.14] + - [959, 9232.14] - - [1024, 3977, 1, 4096] - - [806, 9563.0] + - [942, 9563.0] - - [1024, 3640, 1, 4096] - - [805, 8761.5] + - [941, 8761.5] - - [4096, 3557, 1, 1024] - - [804, 9905.52] + - [940, 9905.52] - - [4096, 3414, 1, 1024] - - [800, 9755.49] + - [936, 9755.49] - - [1024, 3391, 1, 4096] - - [823, 9142.66] + - [959, 9142.66] - - [64, 134, 480, 135] - - [832, 5922.15] + - [968, 5922.15] - - [64, 16, 3840, 16] - - [848, 2080.61] + - [984, 2080.61] - - [1024, 3356, 1, 4096] - - [823, 9051.09] + - [959, 9051.09] - - [4096, 3320, 1, 1024] - - [804, 9929.57] + - [940, 9929.57] - - [4096, 2765, 1, 1024] - - [804, 9750.28] + - [940, 9750.28] - - [64, 162, 400, 162] - - [821, 6515.29] + - [957, 6515.29] - - [1024, 3411, 1, 4096] - - [823, 9185.72] + - [959, 9185.72] - - [1024, 3978, 1, 4096] - - [803, 9562.77] + - [939, 9562.77] - - [4096, 3487, 1, 1024] - - [804, 9733.85] + - [940, 9733.85] - - [4096, 3520, 1, 1024] - - [803, 9813.95] + - [939, 9813.95] - - [4096, 3942, 1, 1024] - - [814, 9804.39] + - [950, 9804.39] - - [4096, 3431, 1, 1024] - - [799, 9819.06] + - [935, 9819.06] - - [1024, 3271, 1, 4096] - - [816, 8913.08] + - [952, 8913.08] - - [4096, 4020, 1, 1024] - - [803, 9831.42] + - [939, 9831.42] - - [1024, 3481, 1, 4096] - - [819, 9376.15] + - [955, 9376.15] - - [1024, 3419, 1, 4096] - - [822, 9208.68] + - [958, 9208.68] - - [1024, 4059, 1, 4096] - - [806, 9733.83] + - [942, 9733.83] - - [4096, 3345, 1, 1024] - - [815, 9651.43] + - [951, 9651.43] - - [4096, 3394, 1, 1024] - - [815, 9780.43] + - [951, 9780.43] - - [1024, 3298, 1, 4096] - - [822, 8889.63] + - [958, 8889.63] - - [4096, 3235, 1, 1024] - - [804, 9705.81] + - [940, 9705.81] - - [1024, 3681, 1, 33708] - - [811, 9146.22] + - [947, 9146.22] - - [1024, 3840, 1, 4096] - - [804, 9253.95] + - [940, 9253.95] - - [1024, 3362, 1, 4096] - - [823, 9059.81] + - [959, 9059.81] - - [4096, 3467, 1, 1024] - - [803, 9677.51] + - [939, 9677.51] - - [1024, 3349, 1, 4096] - - [823, 9034.07] + - [959, 9034.07] - - [1024, 3460, 1, 4096] - - [823, 9322.94] + - [959, 9322.94] - - [4096, 3214, 1, 1024] - - [804, 9644.46] + - [940, 9644.46] - - [1024, 3398, 1, 4096] - - [823, 9157.29] + - [959, 9157.29] - - [4096, 3478, 1, 1024] - - [803, 9706.66] + - [939, 9706.66] - - [1024, 4050, 1, 33708] - - [803, 9865.14] + - [939, 9865.14] - - [1024, 3244, 1, 4096] - - [819, 8744.53] + - [955, 8744.53] - - [4096, 3341, 1, 1024] - - [815, 9646.79] + - [951, 9646.79] - - [4096, 3454, 1, 1024] - - [800, 9880.56] + - [936, 9880.56] - - [1024, 3166, 1, 4096] - - [817, 8618.46] + - [953, 8618.46] - - [1024, 3425, 1, 4096] - - [823, 9225.32] + - [959, 9225.32] - - [4096, 3295, 1, 1024] - - [804, 9863.81] + - [940, 9863.81] - - [4096, 3072, 1, 1024] - - [803, 9971.09] + - [939, 9971.09] - - [4096, 3822, 1, 1024] - - [804, 9952.07] + - [940, 9952.07] - - [1024, 3681, 1, 4096] - - [805, 8856.94] + - [941, 8856.94] - - [1024, 4050, 1, 4096] - - [805, 9717.58] + - [941, 9717.58] - - [4096, 3495, 1, 1024] - - [803, 9741.14] + - [939, 9741.14] - - [4096, 3560, 1, 1024] - - [804, 9909.14] + - [940, 9909.14] - - [1024, 3524, 1, 4096] - - [822, 9503.2] + - [958, 9503.2] - - [1024, 3942, 1, 33708] - - [803, 9602.67] + - [939, 9602.67] - - [1024, 3304, 1, 4096] - - [802, 8928.76] + - [938, 8928.76] - - [1024, 3387, 1, 4096] - - [823, 9127.65] + - [959, 9127.65] - - [1024, 3498, 1, 4096] - - [822, 9423.39] + - [958, 9423.39] - - [4096, 3458, 1, 1024] - - [803, 9642.63] + - [939, 9642.63] - - [4096, 2967, 1, 1024] - - [803, 9626.71] + - [939, 9626.71] - - [64, 8, 7280, 8] - - [825, 1032.61] + - [961, 1032.61] - - [4096, 3385, 1, 1024] - - [799, 9735.77] + - [935, 9735.77] - - [4096, 3434, 1, 1024] - - [814, 9808.9] + - [950, 9808.9] - - [1024, 3519, 1, 4096] - - [823, 9484.83] + - [959, 9484.83] - - [1024, 3511, 1, 4096] - - [823, 9456.47] + - [959, 9456.47] - - [1024, 3288, 1, 4096] - - [822, 8864.05] + - [958, 8864.05] - - [1024, 2918, 1, 4096] - - [805, 9170.35] + - [941, 9170.35] - - [4096, 3573, 1, 1024] - - [804, 9945.85] + - [940, 9945.85] - - [1024, 3822, 1, 33708] - - [813, 9331.0] + - [949, 9331.0] - - [64, 102, 624, 102] - - [850, 5531.17] + - [986, 5531.17] - - [4096, 3539, 1, 1024] - - [804, 9855.39] + - [940, 9855.39] - - [4096, 3332, 1, 1024] - - [815, 9648.97] + - [951, 9648.97] - - [4096, 3286, 1, 1024] - - [804, 9846.42] + - [940, 9846.42] - - [1024, 4026, 1, 4096] - - [805, 9675.94] + - [941, 9675.94] - - [1024, 3277, 1, 4096] - - [819, 8836.21] + - [955, 8836.21] - - [1024, 3471, 1, 4096] - - [823, 9346.33] + - [959, 9346.33] - - [4096, 3518, 1, 1024] - - [804, 9804.2] + - [940, 9804.2] - - [1024, 3393, 1, 4096] - - [823, 9148.99] + - [959, 9148.99] - - [4096, 3413, 1, 1024] - - [800, 9785.17] + - [936, 9785.17] - - [4096, 3303, 1, 1024] - - [804, 9884.37] + - [940, 9884.37] - - [1024, 3207, 1, 4096] - - [801, 8714.69] + - [937, 8714.69] - - [1024, 3894, 1, 1024] - - [817, 9181.51] + - [953, 9181.51] - - [1024, 3977, 1, 1024] - - [817, 9240.9] + - [953, 9240.9] - - [64, 135, 480, 133] - - [832, 5923.4] + - [968, 5923.4] - - [4096, 3535, 1, 1024] - - [804, 9839.55] + - [940, 9839.55] - - [4096, 3376, 1, 1024] - - [799, 9712.02] + - [935, 9712.02] - - [1024, 3355, 1, 4096] - - [823, 9043.27] + - [959, 9043.27] - - [64, 27, 2336, 27] - - [851, 2929.9] + - [987, 2929.9] - - [1024, 3466, 1, 4096] - - [823, 9339.1] + - [959, 9339.1] - - [4096, 3266, 1, 1024] - - [804, 9789.29] + - [940, 9789.29] - - [1024, 3404, 1, 4096] - - [823, 9176.76] + - [959, 9176.76] - - [1024, 3999, 1, 1024] - - [816, 9391.91] + - [952, 9391.91] - - [64, 148, 432, 143] - - [829, 6182.92] + - [965, 6182.92] - - [4096, 3498, 1, 1024] - - [803, 9764.56] + - [939, 9764.56] - - [1024, 4032, 1, 1024] - - [801, 9402.03] + - [937, 9402.03] - - [1024, 3410, 1, 4096] - - [822, 9183.5] + - [958, 9183.5] - - [4096, 3393, 1, 1024] - - [815, 9695.49] + - [951, 9695.49] - - [1024, 3140, 1, 4096] - - [816, 8504.86] + - [952, 8504.86] - - [1024, 3910, 1, 33708] - - [803, 9526.06] + - [939, 9526.06] - - [1024, 3334, 1, 4096] - - [822, 8987.59] + - [958, 8987.59] - - [4096, 3140, 1, 1024] - - [815, 9660.71] + - [951, 9660.71] - - [1024, 4005, 1, 4096] - - [806, 9629.88] + - [942, 9629.88] - - [1024, 3579, 1, 4096] - - [822, 9661.45] + - [958, 9661.45] - - [4096, 3372, 1, 1024] - - [815, 9697.32] + - [951, 9697.32] - - [1024, 3245, 1, 4096] - - [816, 8847.76] + - [952, 8847.76] - - [64, 38, 1680, 38] - - [826, 3340.44] + - [962, 3340.44] - - [4096, 3956, 1, 1024] - - [815, 9911.15] + - [951, 9911.15] - - [4096, 3213, 1, 1024] - - [803, 9643.11] + - [939, 9643.11] - - [1024, 3361, 1, 4096] - - [823, 9062.24] + - [959, 9062.24] - - [1024, 3536, 1, 4096] - - [822, 9530.65] + - [958, 9530.65] - - [1024, 3968, 1, 1024] - - [817, 9377.92] + - [953, 9377.92] - - [4096, 3477, 1, 1024] - - [804, 9700.77] + - [940, 9700.77] - - [4096, 3526, 1, 1024] - - [804, 9824.41] + - [940, 9824.41] - - [1024, 4005, 1, 1024] - - [801, 9362.39] + - [937, 9362.39] - - [1024, 3530, 1, 4096] - - [820, 9487.17] + - [956, 9487.17] - - [1024, 3944, 1, 4096] - - [805, 9464.55] + - [941, 9464.55] - - [4096, 3453, 1, 1024] - - [814, 9826.77] + - [950, 9826.77] - - [4096, 3184, 1, 1024] - - [815, 9833.59] + - [951, 9833.59] - - [4096, 3579, 1, 1024] - - [804, 9962.55] + - [940, 9962.55] - - [4096, 3351, 1, 1024] - - [815, 9653.34] + - [951, 9653.34] - - [4096, 3416, 1, 1024] - - [799, 9810.4] + - [935, 9810.4] - - [64, 100, 624, 100] - - [850, 5408.55] + - [986, 5408.55] - - [1024, 3822, 1, 4096] - - [805, 9196.2] + - [941, 9196.2] - - [1024, 3796, 1, 4096] - - [805, 9131.96] + - [941, 9131.96] - - [4096, 3257, 1, 1024] - - [803, 9767.34] + - [939, 9767.34] - - [4096, 3306, 1, 1024] - - [803, 9893.35] + - [939, 9893.35] - - [1024, 3505, 1, 4096] - - [823, 9450.02] + - [959, 9450.02] - - [1024, 3315, 1, 4096] - - [816, 8979.77] + - [952, 8979.77] - - [1024, 3486, 1, 4096] - - [822, 9393.48] + - [958, 9393.48] - - [4096, 3457, 1, 1024] - - [803, 9653.19] + - [939, 9653.19] - - [4096, 3870, 1, 1024] - - [800, 9717.51] + - [936, 9717.51] - - [1024, 3447, 1, 4096] - - [823, 9273.14] + - [959, 9273.14] - - [1024, 3558, 1, 4096] - - [820, 9567.33] + - [956, 9567.33] - - [4096, 3433, 1, 1024] - - [800, 9759.26] + - [936, 9759.26] - - [4096, 3180, 1, 1024] - - [815, 9738.63] + - [951, 9738.63] - - [1024, 3213, 1, 4096] - - [801, 8692.25] + - [937, 8692.25] - - [1024, 3900, 1, 4096] - - [805, 9388.61] + - [941, 9388.61] - - [4096, 3444, 1, 1024] - - [814, 9869.73] + - [950, 9869.73] - - [1024, 3504, 1, 4096] - - [823, 9429.38] + - [959, 9429.38] - - [4096, 4059, 1, 1024] - - [804, 9920.79] + - [940, 9920.79] - - [1024, 3442, 1, 4096] - - [823, 9273.01] + - [959, 9273.01] - - [4096, 3517, 1, 1024] - - [803, 9808.19] + - [939, 9808.19] - - [1024, 3566, 1, 4096] - - [822, 9622.89] + - [958, 9622.89] - - [4096, 3248, 1, 1024] - - [803, 9730.33] + - [939, 9730.33] - - [1024, 3547, 1, 4096] - - [822, 9564.73] + - [958, 9564.73] - - [64, 59, 1088, 59] - - [841, 4611.76] + - [977, 4611.76] - - [1024, 3340, 1, 4096] - - [822, 8992.21] + - [958, 8992.21] - - [4096, 3480, 1, 1024] - - [804, 9710.17] + - [940, 9710.17] - - [1024, 3968, 1, 4096] - - [804, 9543.11] + - [940, 9543.11] - - [4096, 3424, 1, 1024] - - [800, 9808.66] + - [936, 9808.66] - - [1024, 3906, 1, 1024] - - [802, 9150.54] + - [938, 9150.54] - - [4096, 3265, 1, 1024] - - [803, 9786.85] + - [939, 9786.85] - - [1024, 3384, 1, 4096] - - [823, 9119.56] + - [959, 9119.56] - - [1024, 3494, 1, 4096] - - [820, 9415.52] + - [956, 9415.52] - - [1024, 3236, 1, 4096] - - [817, 8767.14] + - [953, 8767.14] - - [4096, 3497, 1, 1024] - - [804, 9750.86] + - [940, 9750.86] - - [4096, 3354, 1, 1024] - - [815, 9665.17] + - [951, 9665.17] - - [4096, 3055, 1, 1024] - - [804, 9884.09] + - [940, 9884.09] - - [64, 11, 5456, 11] - - [827, 1368.34] + - [963, 1368.34] - - [4096, 3244, 1, 1024] - - [803, 9720.02] + - [939, 9720.02] - - [4096, 3139, 1, 1024] - - [814, 9737.06] + - [950, 9737.06] - - [4096, 3508, 1, 1024] - - [803, 9771.66] + - [939, 9771.66] - - [4096, 4050, 1, 1024] - - [803, 9898.79] + - [939, 9898.79] - - [1024, 3472, 1, 4096] - - [822, 9353.83] + - [958, 9353.83] - - [1024, 3861, 1, 1024] - - [801, 9061.32] + - [937, 9061.32] - - [1024, 3910, 1, 1024] - - [805, 9043.54] + - [941, 9043.54] - - [4096, 3371, 1, 1024] - - [815, 9738.24] + - [951, 9738.24] - - [64, 65, 992, 65] - - [854, 4354.59] + - [990, 4354.59] - - [1024, 3751, 1, 4096] - - [804, 9018.74] + - [940, 9018.74] - - [4096, 3325, 1, 1024] - - [803, 9958.73] + - [939, 9958.73] - - [1024, 3321, 1, 4096] - - [823, 8952.55] + - [959, 8952.55] - - [1024, 3944, 1, 1024] - - [802, 9117.35] + - [938, 9117.35] - - [4096, 3525, 1, 1024] - - [804, 9822.14] + - [940, 9822.14] - - [4096, 3382, 1, 1024] - - [815, 9720.21] + - [951, 9720.21] - - [64, 122, 528, 122] - - [850, 6389.33] + - [986, 6389.33] - - [1024, 3453, 1, 4096] - - [820, 9305.03] + - [956, 9305.03] - - [4096, 3564, 1, 1024] - - [803, 9911.32] + - [939, 9911.32] - - [4096, 3288, 1, 1024] - - [803, 9841.17] + - [939, 9841.17] - - [1024, 3925, 1, 4096] - - [804, 9418.95] + - [940, 9418.95] - - [1024, 3057, 1, 4096] - - [805, 9590.51] + - [941, 9590.51] - - [4096, 3488, 1, 1024] - - [804, 9732.5] + - [940, 9732.5] - - [4096, 3046, 1, 1024] - - [804, 9850.72] + - [940, 9850.72] - - [1024, 3189, 1, 4096] - - [816, 8677.02] + - [952, 8677.02] - - [4096, 3399, 1, 1024] - - [800, 9673.09] + - [936, 9673.09] - - [1024, 3383, 1, 4096] - - [823, 9102.37] + - [959, 9102.37] - - [1024, 3415, 1, 4096] - - [823, 9216.37] + - [959, 9216.37] - - [1024, 3388, 1, 4096] - - [823, 9127.53] + - [959, 9127.53] - - [1024, 3376, 1, 4096] - - [820, 9090.53] + - [956, 9090.53] - - [1024, 3473, 1, 4096] - - [823, 9354.12] + - [959, 9354.12] - - [4096, 3162, 1, 1024] - - [799, 9694.83] + - [935, 9694.83] - - [1024, 3448, 1, 4096] - - [823, 9283.45] + - [959, 9283.45] - - [4096, 3362, 1, 1024] - - [815, 9673.33] + - [951, 9673.33] - - [64, 228, 272, 228] - - [808, 7039.13] + - [944, 7039.13] - - [1024, 3262, 1, 4096] - - [817, 8850.84] + - [953, 8850.84] - - [1024, 3184, 1, 4096] - - [802, 8625.37] + - [938, 8625.37] - - [1024, 3378, 1, 4096] - - [822, 9105.27] + - [958, 9105.27] - - [4096, 3548, 1, 1024] - - [803, 9877.83] + - [939, 9877.83] - - [4096, 2977, 1, 1024] - - [803, 9647.81] + - [939, 9647.81] - - [64, 21, 2976, 21] - - [838, 2364.81] + - [974, 2364.81] - - [64, 112, 576, 111] - - [837, 5973.68] + - [973, 5973.68] - - [4096, 3443, 1, 1024] - - [799, 9784.5] + - [935, 9784.5] - - [1024, 3289, 1, 4096] - - [823, 8874.04] + - [959, 8874.04] - - [1024, 3483, 1, 4096] - - [819, 9380.57] + - [955, 9380.57] - - [4096, 3190, 1, 1024] - - [815, 9850.96] + - [951, 9850.96] - - [1024, 3421, 1, 4096] - - [823, 9214.06] + - [959, 9214.06] - - [1024, 3514, 1, 4096] - - [822, 9458.23] + - [958, 9458.23] - - [1024, 3532, 1, 4096] - - [823, 9513.03] + - [959, 9513.03] - - [1024, 3565, 1, 4096] - - [822, 9630.6] + - [958, 9630.6] - - [4096, 3422, 1, 1024] - - [800, 9733.79] + - [936, 9733.79] - - [4096, 3263, 1, 1024] - - [804, 9776.94] + - [940, 9776.94] - - [4096, 3296, 1, 1024] - - [804, 9860.61] + - [940, 9860.61] - - [4096, 3640, 1, 1024] - - [814, 9782.3] + - [950, 9782.3] - - [4096, 3463, 1, 1024] - - [803, 9672.0] + - [939, 9672.0] - - [4096, 3528, 1, 1024] - - [804, 9829.98] + - [940, 9829.98] - - [1024, 3351, 1, 4096] - - [817, 9054.37] + - [953, 9054.37] - - [1024, 3462, 1, 4096] - - [823, 9327.85] + - [959, 9327.85] - - [4096, 3226, 1, 1024] - - [804, 9674.93] + - [940, 9674.93] - - [4096, 3439, 1, 1024] - - [799, 9823.18] + - [935, 9823.18] - - [4096, 3121, 1, 1024] - - [799, 9672.64] + - [935, 9672.64] - - [1024, 4059, 1, 33708] - - [803, 9885.72] + - [939, 9885.72] - - [1024, 3311, 1, 4096] - - [823, 8910.01] + - [959, 8910.01] - - [1024, 3230, 1, 4096] - - [823, 8705.9] + - [959, 8705.9] - - [4096, 3353, 1, 1024] - - [815, 9671.86] + - [951, 9671.86] - - [4096, 3402, 1, 1024] - - [800, 9727.04] + - [936, 9727.04] - - [1024, 3427, 1, 4096] - - [823, 9233.55] + - [959, 9233.55] - - [1024, 3346, 1, 4096] - - [823, 9015.77] + - [959, 9015.77] - - [1024, 3126, 1, 4096] - - [817, 8519.31] + - [953, 8519.31] - - [1024, 3796, 1, 1024] - - [801, 8916.75] + - [937, 8916.75] - - [1024, 3990, 1, 4096] - - [805, 9600.86] + - [941, 9600.86] - - [1024, 3257, 1, 4096] - - [801, 8790.42] + - [937, 8790.42] - - [4096, 3996, 1, 1024] - - [804, 9788.25] + - [940, 9788.25] - - [64, 143, 432, 143] - - [832, 6087.24] + - [968, 6087.24] - - [1024, 3306, 1, 4096] - - [816, 9035.69] + - [952, 9035.69] - - [1024, 3389, 1, 4096] - - [823, 9134.92] + - [959, 9134.92] - - [1024, 3500, 1, 4096] - - [823, 9443.33] + - [959, 9443.33] - - [1024, 3999, 1, 33708] - - [804, 9741.24] + - [940, 9741.24] - - [4096, 3486, 1, 1024] - - [804, 9719.67] + - [940, 9719.67] - - [1024, 3438, 1, 4096] - - [823, 9259.38] + - [959, 9259.38] - - [4096, 3616, 1, 1024] - - [814, 9739.77] + - [950, 9739.77] - - [1024, 3955, 1, 1024] - - [816, 9260.37] + - [952, 9260.37] - - [4096, 3430, 1, 1024] - - [815, 9819.95] + - [951, 9819.95] - - [4096, 3271, 1, 1024] - - [804, 9802.04] + - [940, 9802.04] - - [1024, 3364, 1, 4096] - - [816, 9144.63] + - [952, 9144.63] - - [64, 54, 1184, 54] - - [836, 4315.78] + - [972, 4315.78] - - [1024, 3497, 1, 4096] - - [823, 9429.42] + - [959, 9429.42] - - [4096, 3503, 1, 1024] - - [803, 9764.48] + - [939, 9764.48] - - [4096, 3344, 1, 1024] - - [800, 9614.16] + - [936, 9614.16] - - [1024, 3457, 1, 4096] - - [823, 9320.6] + - [959, 9320.6] - - [4096, 3466, 1, 1024] - - [803, 9677.81] + - [939, 9677.81] - - [1024, 3976, 1, 33708] - - [804, 9685.38] + - [940, 9685.38] - - [1024, 3395, 1, 4096] - - [822, 9146.39] + - [958, 9146.39] - - [4096, 3361, 1, 1024] - - [814, 9677.89] + - [950, 9677.89] - - [1024, 3751, 1, 33708] - - [812, 9234.69] + - [948, 9234.69] - - [1024, 3822, 1, 1024] - - [801, 8977.83] + - [937, 8977.83] - - [4096, 3315, 1, 1024] - - [804, 9922.54] + - [940, 9922.54] - - [1024, 3163, 1, 4096] - - [816, 8577.79] + - [952, 8577.79] - - [4096, 3547, 1, 1024] - - [804, 9882.92] + - [940, 9882.92] - - [4096, 3340, 1, 1024] - - [814, 9635.42] + - [950, 9635.42] - - [1024, 3296, 1, 4096] - - [823, 8874.66] + - [959, 8874.66] - - [1024, 3468, 1, 4096] - - [823, 9350.26] + - [959, 9350.26] - - [4096, 3294, 1, 1024] - - [803, 9856.87] + - [939, 9856.87] - - [1024, 3406, 1, 4096] - - [819, 9162.84] + - [955, 9162.84] - - [1024, 3860, 1, 33708] - - [803, 9403.56] + - [939, 9403.56] - - [1024, 3584, 1, 4096] - - [820, 9677.44] + - [956, 9677.44] - - [4096, 3189, 1, 1024] - - [815, 9820.69] + - [951, 9820.69] - - [4096, 3494, 1, 1024] - - [803, 9747.68] + - [939, 9747.68] - - [64, 135, 480, 135] - - [829, 5966.34] + - [965, 5966.34] - - [1024, 3093, 1, 4096] - - [817, 8446.06] + - [953, 8446.06] - - [4096, 3421, 1, 1024] - - [800, 9776.03] + - [936, 9776.03] - - [1024, 3479, 1, 4096] - - [823, 9376.54] + - [959, 9376.54] - - [1024, 3433, 1, 4096] - - [823, 9251.14] + - [959, 9251.14] - - [4096, 3311, 1, 1024] - - [803, 9901.53] + - [939, 9901.53] - - [1024, 3381, 1, 4096] - - [823, 9103.99] + - [959, 9103.99] - - [1024, 3996, 1, 4096] - - [804, 9609.56] + - [940, 9609.56] - - [4096, 3384, 1, 1024] - - [814, 9750.01] + - [950, 9750.01] - - [1024, 3247, 1, 4096] - - [802, 8872.59] + - [938, 8872.59] - - [1024, 3169, 1, 4096] - - [801, 8597.61] + - [937, 8597.61] - - [1024, 3088, 1, 4096] - - [817, 8410.07] + - [953, 8410.07] - - [1024, 3363, 1, 4096] - - [823, 9069.5] + - [959, 9069.5] - - [1024, 3538, 1, 4096] - - [822, 9529.68] + - [958, 9529.68] - - [1024, 3996, 1, 1024] - - [806, 9323.06] + - [942, 9323.06] - - [4096, 3169, 1, 1024] - - [800, 9821.4] + - [936, 9821.4] - - [4096, 3538, 1, 1024] - - [803, 9859.42] + - [939, 9859.42] - - [4096, 3401, 1, 1024] - - [800, 9754.5] + - [936, 9754.5] - - [4096, 3581, 1, 1024] - - [803, 9960.71] + - [939, 9960.71] - - [1024, 3180, 1, 4096] - - [801, 8635.05] + - [937, 8635.05] - - [1024, 3870, 1, 1024] - - [802, 9085.69] + - [938, 9085.69] - - [4096, 3555, 1, 1024] - - [803, 9905.74] + - [939, 9905.74] - - [4096, 3412, 1, 1024] - - [815, 9778.56] + - [951, 9778.56] - - [4096, 3302, 1, 1024] - - [803, 9888.71] + - [939, 9888.71] - - [1024, 3561, 1, 4096] - - [819, 9597.05] + - [955, 9597.05] - - [1024, 3302, 1, 4096] - - [823, 8900.87] + - [959, 8900.87] - - [1024, 3976, 1, 4096] - - [805, 9563.22] + - [941, 9563.22] - - [4096, 3485, 1, 1024] - - [803, 9722.57] + - [939, 9722.57] - - [4096, 3534, 1, 1024] - - [803, 9847.22] + - [939, 9847.22] - - [1024, 3110, 1, 4096] - - [816, 8458.56] + - [952, 8458.56] - - [1024, 3401, 1, 4096] - - [823, 9174.81] + - [959, 9174.81] - - [4096, 3216, 1, 1024] - - [803, 9645.49] + - [939, 9645.49] - - [1024, 4020, 1, 33708] - - [803, 9793.61] + - [939, 9793.61] - - [1024, 3215, 1, 4096] - - [823, 8677.51] + - [959, 8677.51] - - [4096, 3566, 1, 1024] - - [803, 9924.78] + - [939, 9924.78] - - [1024, 3137, 1, 4096] - - [801, 8547.07] + - [937, 8547.07] - - [4096, 3359, 1, 1024] - - [800, 9673.73] + - [936, 9673.73] - - [4096, 3392, 1, 1024] - - [815, 9757.51] + - [951, 9757.51] - - [1024, 3506, 1, 4096] - - [823, 9443.0] + - [959, 9443.0] - - [4096, 3233, 1, 1024] - - [803, 9698.7] + - [939, 9698.7] - - [1024, 3444, 1, 4096] - - [823, 9275.54] + - [959, 9275.54] - - [1024, 3975, 1, 4096] - - [804, 9556.87] + - [940, 9556.87] - - [1024, 3870, 1, 33708] - - [803, 9427.44] + - [939, 9427.44] - - [4096, 3465, 1, 1024] - - [804, 9675.01] + - [940, 9675.01] - - [4096, 3968, 1, 1024] - - [800, 9927.93] + - [936, 9927.93] - - [1024, 3523, 1, 4096] - - [823, 9494.15] + - [959, 9494.15] - - [64, 10, 5952, 10] - - [827, 1224.16] + - [963, 1224.16] - - [4096, 3990, 1, 1024] - - [803, 9771.27] + - [939, 9771.27] - - [1024, 3549, 1, 4096] - - [822, 9553.42] + - [958, 9553.42] - - [1024, 3342, 1, 4096] - - [823, 9007.31] + - [959, 9007.31] - - [4096, 3476, 1, 1024] - - [803, 9703.66] + - [939, 9703.66] - - [64, 232, 272, 228] - - [809, 7078.93] + - [945, 7078.93] - - [1024, 3418, 1, 4096] - - [823, 9213.09] + - [959, 9213.09] - - [1024, 3859, 1, 1024] - - [802, 9087.54] + - [938, 9087.54] - - [4096, 3339, 1, 1024] - - [815, 9594.0] + - [951, 9594.0] - - [4096, 3452, 1, 1024] - - [800, 9872.69] + - [936, 9872.69] - - [4096, 3293, 1, 1024] - - [803, 9842.65] + - [939, 9842.65] - - [4096, 3840, 1, 1024] - - [804, 10030.8] + - [940, 10030.8] - - [1024, 3369, 1, 4096] - - [801, 9099.72] + - [937, 9099.72] - - [64, 193, 320, 193] - - [831, 6425.8] + - [967, 6425.8] - - [1024, 3544, 1, 4096] - - [820, 9556.64] + - [956, 9556.64] - - [4096, 3493, 1, 1024] - - [804, 9743.34] + - [940, 9743.34] - - [4096, 3350, 1, 1024] - - [815, 9653.11] + - [951, 9653.11] - - [64, 71, 896, 71] - - [855, 4686.73] + - [991, 4686.73] - - [4096, 3256, 1, 1024] - - [803, 9763.78] + - [939, 9763.78] - - [1024, 3870, 1, 4096] - - [805, 9305.28] + - [941, 9305.28] - - [4096, 4012, 1, 1024] - - [804, 9817.35] + - [940, 9817.35] - - [1024, 3280, 1, 4096] - - [823, 8842.02] + - [959, 8842.02] - - [4096, 3456, 1, 1024] - - [799, 9874.43] + - [935, 9874.43] - - [1024, 3555, 1, 4096] - - [822, 9599.63] + - [958, 9599.63] - - [4096, 3014, 1, 1024] - - [803, 9762.28] + - [939, 9762.28] - - [1024, 3474, 1, 4096] - - [823, 9373.67] + - [959, 9373.67] - - [4096, 3367, 1, 1024] - - [799, 9694.64] + - [935, 9694.64] - - [4096, 3432, 1, 1024] - - [815, 9855.27] + - [951, 9855.27] - - [64, 84, 752, 84] - - [842, 5247.18] + - [978, 5247.18] - - [4096, 3273, 1, 1024] - - [804, 9801.87] + - [940, 9801.87] - - [4096, 3130, 1, 1024] - - [800, 9672.52] + - [936, 9672.52] - - [1024, 2984, 1, 4096] - - [805, 9403.7] + - [941, 9403.7] - - [1024, 3995, 1, 1024] - - [817, 9392.61] + - [953, 9392.61] - - [1024, 3517, 1, 4096] - - [823, 9481.39] + - [959, 9481.39] - - [1024, 3455, 1, 4096] - - [823, 9302.29] + - [959, 9302.29] - - [1024, 3939, 1, 4096] - - [805, 9469.89] + - [941, 9469.89] - - [64, 49, 1296, 49] - - [835, 3938.96] + - [971, 3938.96] - - [64, 14, 4368, 14] - - [827, 1802.47] + - [963, 1802.47] - - [64, 25, 2512, 25] - - [846, 2760.54] + - [982, 2760.54] - - [4096, 3147, 1, 1024] - - [815, 9713.03] + - [951, 9713.03] - - [4096, 3516, 1, 1024] - - [803, 9805.93] + - [939, 9805.93] - - [1024, 3876, 1, 4096] - - [805, 9320.56] + - [941, 9320.56] - - [1024, 3191, 1, 4096] - - [802, 8640.76] + - [938, 8640.76] - - [4096, 3411, 1, 1024] - - [814, 9737.37] + - [950, 9737.37] - - [1024, 3337, 1, 4096] - - [823, 8990.13] + - [959, 8990.13] - - [1024, 3512, 1, 4096] - - [823, 9459.65] + - [959, 9459.65] - - [4096, 3301, 1, 1024] - - [803, 9877.26] + - [939, 9877.26] - - [1024, 3450, 1, 4096] - - [822, 9283.11] + - [958, 9283.11] - - [4096, 3533, 1, 1024] - - [803, 9848.62] + - [939, 9848.62] - - [4096, 3390, 1, 1024] - - [815, 9764.61] + - [951, 9764.61] - - [4096, 3231, 1, 1024] - - [803, 9693.81] + - [939, 9693.81] - - [1024, 2499, 1, 4096] - - [822, 9304.81] + - [958, 9304.81] - - [1024, 3186, 1, 4096] - - [802, 8649.55] + - [938, 8649.55] - - [1024, 3380, 1, 4096] - - [823, 9101.77] + - [959, 9101.77] - - [4096, 3496, 1, 1024] - - [804, 9754.3] + - [940, 9754.3] - - [1024, 3956, 1, 33708] - - [803, 9636.77] + - [939, 9636.77] - - [1024, 3976, 1, 1024] - - [805, 9248.41] + - [941, 9248.41] - - [4096, 2736, 1, 1024] - - [803, 9651.91] + - [939, 9651.91] - - [1024, 3291, 1, 4096] - - [823, 8868.94] + - [959, 8868.94] - - [1024, 3944, 1, 33708] - - [804, 9607.0] + - [940, 9607.0] - - [1024, 3485, 1, 4096] - - [822, 9385.96] + - [958, 9385.96] - - [4096, 3138, 1, 1024] - - [800, 9672.15] + - [936, 9672.15] - - [1024, 3423, 1, 4096] - - [823, 9222.77] + - [959, 9222.77] - - [1024, 3491, 1, 4096] - - [823, 9405.02] + - [959, 9405.02] - - [1024, 3860, 1, 4096] - - [806, 9282.94] + - [942, 9282.94] - - [4096, 3211, 1, 1024] - - [803, 9640.42] + - [939, 9640.42] - - [1024, 3221, 1, 4096] - - [817, 8709.4] + - [953, 8709.4] - - [1024, 2917, 1, 4096] - - [805, 9177.11] + - [941, 9177.11] - - [4096, 3475, 1, 1024] - - [803, 9703.45] + - [939, 9703.45] - - [4096, 3524, 1, 1024] - - [803, 9816.23] + - [939, 9816.23] - - [4096, 2985, 1, 1024] - - [804, 9686.91] + - [940, 9686.91] - - [1024, 3480, 1, 4096] - - [823, 9380.2] + - [959, 9380.2] - - [4096, 3222, 1, 1024] - - [803, 9666.8] + - [939, 9666.8] - - [4096, 3451, 1, 1024] - - [799, 9877.91] + - [935, 9877.91] - - [1024, 3969, 1, 33708] - - [803, 9669.64] + - [939, 9669.64] - - [1024, 3640, 1, 1024] - - [810, 8565.68] + - [946, 8565.68] - - [1024, 3297, 1, 4096] - - [819, 8889.22] + - [955, 8889.22] - - [4096, 3944, 1, 1024] - - [800, 9902.85] + - [936, 9902.85] - - [1024, 3216, 1, 4096] - - [802, 8695.88] + - [938, 8695.88] - - [1024, 3840, 1, 1024] - - [816, 9046.05] + - [952, 9046.05] - - [4096, 3349, 1, 1024] - - [814, 9676.82] + - [950, 9676.82] - - [4096, 3398, 1, 1024] - - [800, 9775.84] + - [936, 9775.84] - - [1024, 3154, 1, 4096] - - [817, 8662.26] + - [953, 8662.26] - - [1024, 3978, 1, 33708] - - [804, 9689.16] + - [940, 9689.16] - - [1024, 3348, 1, 4096] - - [823, 9014.67] + - [959, 9014.67] - - [4096, 3304, 1, 1024] - - [804, 9886.8] + - [940, 9886.8] - - [4096, 4030, 1, 1024] - - [804, 9859.1] + - [940, 9859.1] - - [1024, 4026, 1, 1024] - - [801, 9326.64] + - [937, 9326.64] - - [4096, 3471, 1, 1024] - - [803, 9683.0] + - [939, 9683.0] - - [1024, 3259, 1, 4096] - - [817, 8792.19] + - [953, 8792.19] - - [64, 132, 480, 132] - - [857, 6027.86] + - [993, 6027.86] - - [1024, 3308, 1, 4096] - - [822, 8905.14] + - [958, 8905.14] - - [4096, 3391, 1, 1024] - - [815, 9765.35] + - [951, 9765.35] - - [1024, 3312, 1, 4096] - - [823, 8917.74] + - [959, 8917.74] - - [1024, 3502, 1, 4096] - - [823, 9435.62] + - [959, 9435.62] - - [1024, 3968, 1, 33708] - - [803, 9668.24] + - [939, 9668.24] - - [1024, 3424, 1, 4096] - - [819, 9215.99] + - [955, 9215.99] - - [64, 13, 4672, 13] - - [828, 1662.35] + - [964, 1662.35] - - [4096, 4032, 1, 1024] - - [814, 9877.82] + - [950, 9877.82] - - [1024, 3900, 1, 1024] - - [817, 9116.93] + - [953, 9116.93] - - [4096, 3442, 1, 1024] - - [814, 9773.18] + - [950, 9773.18] - - [1024, 3366, 1, 4096] - - [823, 9079.46] + - [959, 9079.46] - - [4096, 3999, 1, 1024] - - [803, 9786.46] + - [939, 9786.46] - - [1024, 3477, 1, 4096] - - [823, 9364.89] + - [959, 9364.89] - - [1024, 2505, 1, 4096] - - [823, 9304.03] + - [959, 9304.03] - - [4096, 3515, 1, 1024] - - [803, 9797.93] + - [939, 9797.93] - - [1024, 3564, 1, 4096] - - [819, 9632.86] + - [955, 9632.86] - - [4096, 3057, 1, 1024] - - [804, 9880.19] + - [940, 9880.19] - - [1024, 3339, 1, 4096] - - [802, 9029.86] + - [938, 9029.86] - - [4096, 3262, 1, 1024] - - [803, 9780.1] + - [939, 9780.1] - - [1024, 4030, 1, 4096] - - [806, 9682.0] + - [942, 9682.0] - - [1024, 3265, 1, 4096] - - [823, 8797.52] + - [959, 8797.52] - - [1024, 3459, 1, 4096] - - [823, 9313.06] + - [959, 9313.06] - - [4096, 3462, 1, 1024] - - [804, 9669.73] + - [940, 9669.73] - - [64, 85, 752, 85] - - [842, 5186.93] + - [978, 5186.93] - - [1024, 3513, 1, 4096] - - [820, 9469.15] + - [956, 9469.15] - - [1024, 3397, 1, 4096] - - [823, 9151.77] + - [959, 9151.77] - - [4096, 3572, 1, 1024] - - [803, 9945.7] + - [939, 9945.7] - - [4096, 3389, 1, 1024] - - [815, 9740.86] + - [951, 9740.86] - - [4096, 3438, 1, 1024] - - [815, 9822.47] + - [951, 9822.47] - - [64, 102, 624, 100] - - [850, 5487.0] + - [986, 5487.0] - - [1024, 3640, 1, 33708] - - [811, 9083.53] + - [947, 9083.53] - - [1024, 3995, 1, 33708] - - [804, 9731.99] + - [940, 9731.99] - - [1024, 3165, 1, 4096] - - [816, 8601.9] + - [952, 8601.9] - - [4096, 3543, 1, 1024] - - [804, 9868.63] + - [940, 9868.63] - - [4096, 3352, 1, 1024] - - [799, 9668.44] + - [935, 9668.44] - - [1024, 3359, 1, 4096] - - [820, 9050.33] + - [956, 9050.33] - - [1024, 3470, 1, 4096] - - [823, 9355.17] + - [959, 9355.17] - - [64, 15, 4096, 15] - - [827, 1945.43] + - [963, 1945.43] - - [1024, 3392, 1, 4096] - - [822, 9139.71] + - [958, 9139.71] - - [64, 78, 816, 77] - - [834, 4870.56] + - [970, 4870.56] - - [4096, 3137, 1, 1024] - - [799, 9600.22] + - [935, 9600.22] - - [4096, 3506, 1, 1024] - - [804, 9779.08] + - [940, 9779.08] - - [1024, 3095, 1, 4096] - - [816, 8381.24] + - [952, 8381.24] - - [1024, 3859, 1, 4096] - - [803, 9288.63] + - [939, 9288.63] - - [4096, 3369, 1, 1024] - - [815, 9697.73] + - [951, 9697.73] - - [64, 45, 1424, 45] - - [852, 3883.74] + - [988, 3883.74] - - [1024, 3435, 1, 4096] - - [823, 9264.62] + - [959, 9264.62] - - [1024, 3354, 1, 4096] - - [823, 9035.47] + - [959, 9035.47] - - [1024, 3055, 1, 4096] - - [804, 9597.45] + - [940, 9597.45] - - [4096, 3523, 1, 1024] - - [803, 9821.79] + - [939, 9821.79] - - [4096, 3380, 1, 1024] - - [799, 9721.39] + - [935, 9721.39] - - [1024, 3233, 1, 4096] - - [816, 8724.75] + - [952, 8724.75] - - [4096, 3221, 1, 1024] - - [803, 9661.04] + - [939, 9661.04] - - [4096, 3270, 1, 1024] - - [803, 9797.92] + - [939, 9797.92] - - [4096, 3593, 1, 1024] - - [814, 9679.31] + - [950, 9679.31] - - [1024, 3358, 1, 4096] - - [823, 9051.82] + - [959, 9051.82] - - [1024, 3540, 1, 4096] - - [823, 9533.59] + - [959, 9533.59] - - [4096, 3502, 1, 1024] - - [804, 9760.65] + - [940, 9760.65] - - [4096, 2505, 1, 1024] - - [804, 9680.52] + - [940, 9680.52] - - [4096, 3397, 1, 1024] - - [814, 9785.85] + - [950, 9785.85] - - [1024, 3300, 1, 4096] - - [817, 8907.85] + - [953, 8907.85] - - [4096, 3095, 1, 1024] - - [800, 9618.78] + - [936, 9618.78] - - [1024, 3182, 1, 4096] - - [816, 8606.16] + - [952, 8606.16] - - [1024, 3299, 1, 4096] - - [822, 8885.48] + - [958, 8885.48] - - [1024, 3276, 1, 4096] - - [817, 8872.75] + - [953, 8872.75] - - [1024, 3360, 1, 4096] - - [820, 9044.2] + - [956, 9044.2] - - [4096, 3360, 1, 1024] - - [815, 9681.39] + - [951, 9681.39] - - [4096, 2918, 1, 1024] - - [799, 9732.74] + - [935, 9732.74] - - [1024, 3939, 1, 33708] - - [803, 9595.96] + - [939, 9595.96] - - [4096, 3314, 1, 1024] - - [804, 9915.02] + - [940, 9915.02] - - [1024, 3319, 1, 4096] - - [823, 8956.37] + - [959, 8956.37] - - [64, 35, 1808, 35] - - [840, 3060.27] + - [976, 3060.27] - - [1024, 3942, 1, 1024] - - [816, 9211.83] + - [952, 9211.83] - - [1024, 3465, 1, 4096] - - [823, 9340.73] + - [959, 9340.73] - - [4096, 3546, 1, 1024] - - [804, 9875.41] + - [940, 9875.41] - - [1024, 3403, 1, 4096] - - [816, 9224.34] + - [952, 9224.34] - - [1024, 3948, 1, 1024] - - [802, 9245.63] + - [938, 9245.63] - - [4096, 3441, 1, 1024] - - [815, 9758.72] + - [951, 9758.72] - - [1024, 3139, 1, 4096] - - [816, 8582.84] + - [952, 8582.84] - - [1024, 3563, 1, 4096] - - [823, 9620.74] + - [959, 9620.74] - - [1024, 3508, 1, 4096] - - [820, 9449.36] + - [956, 9449.36] - - [1024, 3975, 1, 33708] - - [803, 9683.55] + - [939, 9683.55] - - [1024, 3446, 1, 4096] - - [822, 9289.51] + - [958, 9289.51] - - [1024, 3529, 1, 4096] - - [819, 9491.29] + - [955, 9491.29] - - [64, 112, 576, 112] - - [844, 6387.14] + - [980, 6387.14] - - [4096, 3461, 1, 1024] - - [804, 9663.33] + - [940, 9663.33] - - [1024, 3574, 1, 4096] - - [822, 9662.88] + - [958, 9662.88] - - [1024, 3101, 1, 4096] - - [817, 8468.34] + - [953, 8468.34] - - [1024, 3927, 1, 1024] - - [802, 9207.97] + - [938, 9207.97] - - [4096, 3224, 1, 1024] - - [804, 9665.61] + - [940, 9665.61] - - [4096, 3437, 1, 1024] - - [800, 9857.21] + - [936, 9857.21] - - [4096, 3900, 1, 1024] - - [815, 9826.25] + - [951, 9826.25] - - [1024, 3495, 1, 4096] - - [823, 9412.41] + - [959, 9412.41] - - [1024, 3977, 1, 33708] - - [803, 9687.87] + - [939, 9687.87] - - [1024, 3328, 1, 4096] - - [823, 8975.57] + - [959, 8975.57] - - [4096, 3168, 1, 1024] - - [799, 9754.87] + - [935, 9754.87] - - [1024, 4026, 1, 33708] - - [803, 9807.24] + - [939, 9807.24] - - [1024, 3292, 1, 4096] - - [816, 8901.83] + - [952, 8901.83] - - [1024, 3294, 1, 4096] - - [823, 8877.03] + - [959, 8877.03] - - [4096, 3335, 1, 1024] - - [800, 9616.23] + - [936, 9616.23] - - [4096, 3400, 1, 1024] - - [814, 9710.73] + - [950, 9710.73] - - [1024, 3287, 1, 4096] - - [801, 8908.07] + - [937, 8908.07] - - [1024, 3910, 1, 4096] - - [805, 9401.03] + - [941, 9401.03] - - [1024, 3780, 1, 1024] - - [816, 8863.29] + - [952, 8863.29] - - [4096, 3098, 1, 1024] - - [800, 9606.47] + - [936, 9606.47] - - [1024, 3584, 1, 33708] - - [823, 9775.33] + - [959, 9775.33] - - [64, 29, 2176, 29] - - [845, 3135.03] + - [981, 3135.03] - - [1024, 3371, 1, 4096] - - [801, 9117.81] + - [937, 9117.81] - - [1024, 3546, 1, 4096] - - [823, 9547.3] + - [959, 9547.3] - - [1024, 4012, 1, 1024] - - [805, 9353.73] + - [941, 9353.73] - - [4096, 3505, 1, 1024] - - [803, 9773.17] + - [939, 9773.17] - - [4096, 3554, 1, 1024] - - [803, 9895.59] + - [939, 9895.59] - - [4096, 3063, 1, 1024] - - [803, 9898.98] + - [939, 9898.98] - - [1024, 3900, 1, 33708] - - [804, 9502.93] + - [940, 9502.93] - - [1024, 3345, 1, 4096] - - [823, 9015.85] + - [959, 9015.85] - - [1024, 3357, 1, 4096] - - [823, 9041.23] + - [959, 9041.23] - - [1024, 3282, 1, 4096] - - [816, 8860.17] + - [952, 8860.17] - - [4096, 3484, 1, 1024] - - [804, 9721.33] + - [940, 9721.33] - - [1024, 3557, 1, 4096] - - [820, 9573.48] + - [956, 9573.48] - - [1024, 3476, 1, 4096] - - [823, 9361.72] + - [959, 9361.72] - - [1024, 3751, 1, 1024] - - [817, 8849.11] + - [953, 8849.11] - - [4096, 3379, 1, 1024] - - [800, 9741.49] + - [936, 9741.49] - - [4096, 3428, 1, 1024] - - [799, 9767.82] + - [935, 9767.82] - - [4096, 3126, 1, 1024] - - [814, 9701.9] + - [950, 9701.9] - - [64, 41, 1552, 41] - - [849, 3555.69] + - [985, 3555.69] - - [1024, 3325, 1, 4096] - - [801, 8962.41] + - [937, 8962.41] - - [4096, 3501, 1, 1024] - - [803, 9762.01] + - [939, 9762.01] - - [4096, 3358, 1, 1024] - - [799, 9680.42] + - [935, 9680.42] - - [1024, 3441, 1, 4096] - - [823, 9271.27] + - [959, 9271.27] - - [1024, 3552, 1, 4096] - - [819, 9565.42] + - [955, 9565.42] - - [4096, 3232, 1, 1024] - - [804, 9696.81] + - [940, 9696.81] - - [64, 18, 3440, 18] - - [824, 2059.33] + - [960, 2059.33] - - [1024, 3412, 1, 4096] - - [823, 9199.28] + - [959, 9199.28] - - [1024, 3372, 1, 4096] - - [820, 9083.49] + - [956, 9083.49] - - [1024, 3585, 1, 4096] - - [810, 8710.29] + - [946, 8710.29] - - [4096, 3143, 1, 1024] - - [815, 9692.12] + - [951, 9692.12] - - [4096, 3464, 1, 1024] - - [803, 9661.93] + - [939, 9661.93] - - [1024, 3145, 1, 4096] - - [802, 8526.33] + - [938, 8526.33] - - [4096, 3375, 1, 1024] - - [814, 9734.78] + - [950, 9734.78] - - [4096, 2917, 1, 1024] - - [799, 9714.57] + - [935, 9714.57] - - [4096, 3978, 1, 1024] - - [804, 9741.43] + - [940, 9741.43] - - [1024, 2765, 1, 4096] - - [805, 8706.75] + - [941, 8706.75] - - [64, 148, 432, 148] - - [830, 6372.17] + - [966, 6372.17] - - [1024, 3452, 1, 4096] - - [822, 9301.38] + - [958, 9301.38] - - [4096, 3584, 1, 1024] - - [804, 10005.7] + - [940, 10005.7] - - [4096, 3545, 1, 1024] - - [804, 9877.87] + - [940, 9877.87] - - [1024, 3352, 1, 4096] - - [823, 9035.19] + - [959, 9035.19] - - [64, 159, 400, 160] - - [832, 6952.11] + - [968, 6952.11] - - [4096, 3292, 1, 1024] - - [803, 9856.51] + - [939, 9856.51] - - [1024, 3525, 1, 4096] - - [823, 9501.5] + - [959, 9501.5] - - [1024, 3266, 1, 4096] - - [823, 8817.43] + - [959, 8817.43] - - [1024, 3382, 1, 4096] - - [822, 9101.54] + - [958, 9101.54] - - [4096, 3492, 1, 1024] - - [803, 9747.29] + - [939, 9747.29] - - [4096, 3419, 1, 1024] - - [815, 9745.88] + - [951, 9745.88] - - [1024, 3796, 1, 33708] - - [812, 9356.26] + - [948, 9356.26] - - [1024, 3293, 1, 4096] - - [819, 8868.4] + - [955, 8868.4] - - [4096, 3796, 1, 1024] - - [804, 9885.36] + - [940, 9885.36] - - [1024, 3487, 1, 4096] - - [820, 9391.34] + - [956, 9391.34] - - [4096, 3166, 1, 1024] - - [815, 9718.46] + - [951, 9718.46] - - [64, 102, 624, 101] - - [844, 5547.84] + - [980, 5547.84] - - [1024, 3409, 1, 4096] - - [823, 9187.88] + - [959, 9187.88] - - [1024, 3520, 1, 4096] - - [822, 9485.09] + - [958, 9485.09] - - [1024, 3573, 1, 4096] - - [823, 9652.71] + - [959, 9652.71] - - [4096, 3366, 1, 1024] - - [799, 9684.31] + - [935, 9684.31] - - [4096, 3720, 1, 1024] - - [815, 9703.34] + - [951, 9703.34] - - [4096, 3207, 1, 1024] - - [803, 9626.21] + - [939, 9626.21] - - [4096, 3272, 1, 1024] - - [803, 9795.51] + - [939, 9795.51] - - [1024, 3390, 1, 4096] - - [823, 9125.88] + - [959, 9125.88] - - [4096, 3183, 1, 1024] - - [815, 9825.87] + - [951, 9825.87] - - [4096, 3536, 1, 1024] - - [804, 9846.51] + - [940, 9846.51] - - [4096, 3563, 1, 1024] - - [804, 9913.8] + - [940, 9913.8] - - [1024, 3482, 1, 4096] - - [823, 9376.91] + - [959, 9376.91] - - [4096, 3447, 1, 1024] - - [814, 9875.09] + - [950, 9875.09] - - [4096, 3955, 1, 1024] - - [799, 9922.39] + - [935, 9922.39] - - [4096, 4005, 1, 1024] - - [804, 9803.43] + - [940, 9803.43] - - [1024, 3493, 1, 4096] - - [823, 9411.37] + - [959, 9411.37] - - [4096, 3410, 1, 1024] - - [799, 9788.34] + - [935, 9788.34] - - [1024, 3422, 1, 4096] - - [822, 9216.28] + - [958, 9216.28] - - [1024, 3350, 1, 4096] - - [817, 9068.02] + - [953, 9068.02] - - [4096, 3300, 1, 1024] - - [804, 9883.29] + - [940, 9883.29] - - [4096, 3910, 1, 1024] - - [814, 9800.12] + - [950, 9800.12] - - [1024, 3489, 1, 4096] - - [823, 9398.66] + - [959, 9398.66] - - [4096, 3483, 1, 1024] - - [803, 9715.96] + - [939, 9715.96] - - [4096, 3532, 1, 1024] - - [804, 9837.99] + - [940, 9837.99] - - [64, 101, 624, 101] - - [844, 5452.28] + - [980, 5452.28] - - [4096, 3230, 1, 1024] - - [804, 9683.6] + - [940, 9683.6] - - [4096, 3427, 1, 1024] - - [799, 9760.72] + - [935, 9760.72] - - [1024, 3377, 1, 4096] - - [823, 9101.17] + - [959, 9101.17] - - [1024, 3488, 1, 4096] - - [822, 9381.99] + - [958, 9381.99] - - [1024, 3616, 1, 4096] - - [805, 8709.33] + - [941, 8709.33] - - [1024, 3426, 1, 4096] - - [823, 9229.43] + - [959, 9229.43] - - [4096, 3357, 1, 1024] - - [815, 9668.5] + - [951, 9668.5] - - [4096, 3406, 1, 1024] - - [800, 9748.57] + - [936, 9748.57] - - [1024, 3046, 1, 4096] - - [805, 9590.43] + - [941, 9590.43] - - [1024, 3272, 1, 4096] - - [816, 8930.2] + - [952, 8930.2] - - [1024, 3256, 1, 4096] - - [801, 8828.16] + - [937, 8828.16] - - [4096, 3247, 1, 1024] - - [803, 9741.81] + - [939, 9741.81] - - [4096, 3088, 1, 1024] - - [815, 9589.07] + - [951, 9589.07] - - [1024, 3531, 1, 4096] - - [822, 9501.06] + - [958, 9501.06] - - [64, 160, 400, 160] - - [858, 7334.03] + - [994, 7334.03] - - [4096, 3511, 1, 1024] - - [804, 9789.38] + - [940, 9789.38] - - [1024, 3720, 1, 33708] - - [813, 9214.68] + - [949, 9214.68] - - [1024, 3267, 1, 4096] - - [816, 8831.04] + - [952, 8831.04] - - [1024, 3270, 1, 4096] - - [817, 8876.68] + - [953, 8876.68] - - [1024, 3461, 1, 4096] - - [822, 9327.55] + - [958, 9327.55] - - [4096, 3474, 1, 1024] - - [803, 9697.04] + - [939, 9697.04] - - [4096, 2984, 1, 1024] - - [804, 9674.08] + - [940, 9674.08] - - [1024, 3399, 1, 4096] - - [822, 9158.58] + - [958, 9158.58] - - [4096, 3574, 1, 1024] - - [803, 9942.3] + - [939, 9942.3] - - [1024, 3876, 1, 1024] - - [817, 9085.13] + - [953, 9085.13] - - [4096, 3337, 1, 1024] - - [800, 9611.43] + - [936, 9611.43] - - [4096, 3450, 1, 1024] - - [815, 9930.35] + - [951, 9930.35] - - [1024, 3720, 1, 1024] - - [801, 8755.49] + - [937, 8755.49] - - [1024, 4059, 1, 1024] - - [806, 9366.67] + - [942, 9366.67] - - [4096, 3291, 1, 1024] - - [803, 9856.33] + - [939, 9856.33] - - [64, 93, 688, 93] - - [847, 5497.11] + - [983, 5497.11] - - [4096, 3995, 1, 1024] - - [803, 9776.67] + - [939, 9776.67] - - [64, 147, 432, 147] - - [833, 6233.88] + - [969, 6233.88] - - [4096, 3491, 1, 1024] - - [803, 9742.94] + - [939, 9742.94] - - [4096, 3348, 1, 1024] - - [815, 9634.11] + - [951, 9634.11] - - [4096, 3925, 1, 1024] - - [814, 9848.54] + - [950, 9848.54] - - [4096, 3894, 1, 1024] - - [814, 9812.55] + - [950, 9812.55] - - [1024, 3456, 1, 4096] - - [823, 9317.91] + - [959, 9317.91] - - [1024, 3394, 1, 4096] - - [822, 9148.86] + - [958, 9148.86] - - [64, 100, 624, 102] - - [844, 5416.95] + - [980, 5416.95] - - [4096, 3165, 1, 1024] - - [814, 9743.35] + - [950, 9743.35] - - [4096, 3470, 1, 1024] - - [804, 9691.04] + - [940, 9691.04] - - [1024, 3014, 1, 4096] - - [805, 9486.26] + - [941, 9486.26] - - [1024, 3375, 1, 4096] - - [823, 9082.71] + - [959, 9082.71] - - [4096, 3859, 1, 1024] - - [814, 9738.87] + - [950, 9738.87] - - [4096, 3365, 1, 1024] - - [815, 9694.74] + - [951, 9694.74] - - [1024, 3162, 1, 4096] - - [816, 8550.31] + - [952, 8550.31] - - [1024, 3840, 1, 33708] - - [813, 9409.08] + - [949, 9409.08] - - [1024, 3437, 1, 4096] - - [823, 9270.49] + - [959, 9270.49] - - [4096, 3319, 1, 1024] - - [804, 9927.15] + - [940, 9927.15] - - [1024, 3320, 1, 4096] - - [823, 8962.29] + - [959, 8962.29] - - [64, 23, 2720, 23] - - [846, 2569.53] + - [982, 2569.53] - - [4096, 3328, 1, 1024] - - [803, 9997.41] + - [939, 9997.41] - - [1024, 3235, 1, 4096] - - [823, 8724.31] + - [959, 8724.31] - - [4096, 3282, 1, 1024] - - [804, 9827.13] + - [940, 9827.13] - - [1024, 3367, 1, 4096] - - [816, 9084.02] + - [952, 9084.02] - - [1024, 3542, 1, 4096] - - [823, 9533.1] + - [959, 9533.1] - - [64, 177, 352, 177] - - [809, 6817.91] + - [945, 6817.91] - - [4096, 3145, 1, 1024] - - [800, 9710.28] + - [936, 9710.28] - - [4096, 3514, 1, 1024] - - [803, 9793.06] + - [939, 9793.06] - - [1024, 3432, 1, 4096] - - [823, 9249.39] + - [959, 9249.39] - - [4096, 3409, 1, 1024] - - [799, 9721.6] + - [935, 9721.6] - - [1024, 4012, 1, 33708] - - [803, 9773.35] + - [939, 9773.35] - - [4096, 3876, 1, 1024] - - [800, 9745.65] + - [936, 9745.65] - - [4096, 3299, 1, 1024] - - [803, 9873.53] + - [939, 9873.53] - - [1024, 3168, 1, 4096] - - [816, 8597.13] + - [952, 8597.13] - - [4096, 3681, 1, 1024] - - [815, 9840.03] + - [951, 9840.03] - - [4096, 3531, 1, 1024] - - [804, 9847.76] + - [940, 9847.76] - - [4096, 3388, 1, 1024] - - [815, 9772.28] + - [951, 9772.28] - - [1024, 3720, 1, 4096] - - [804, 8951.6] + - [940, 8951.6] - - [1024, 3332, 1, 4096] - - [823, 8978.97] + - [959, 8978.97] - - [1024, 3273, 1, 4096] - - [817, 8982.49] + - [953, 8982.49] - - [1024, 2935, 1, 4096] - - [806, 9224.89] + - [942, 9224.89] - - [1024, 3467, 1, 4096] - - [820, 9329.33] + - [956, 9329.33] - - [4096, 3542, 1, 1024] - - [803, 9858.51] + - [939, 9858.51] - - [1024, 3130, 1, 4096] - - [802, 8526.66] + - [938, 8526.66] - - [1024, 3405, 1, 4096] - - [823, 9163.44] + - [959, 9163.44] - - [1024, 3960, 1, 1024] - - [801, 9280.36] + - [937, 9280.36] - - [4096, 3405, 1, 1024] - - [814, 9710.2] + - [950, 9710.2] - - [512, 512, 1, 1024] - - [1000, 6670.96] + - [1136, 6670.96] - - [8, 500, 1, 512] - - [896, 228.671] + - [1032, 228.671] - - [512, 512, 1, 2000] - - [1033, 7629.44] + - [1169, 7629.44] - - [32, 512, 1, 512] - - [893, 904.045] + - [1029, 904.045] - - [100, 1024, 1, 2048] - - [955, 3196.98] + - [1091, 3196.98] - - [8, 512, 1, 500] - - [886, 237.137] + - [1022, 237.137] - - [8, 500, 1, 1024] - - [950, 289.366] + - [1086, 289.366] - - [100, 2000, 1, 1024] - - [989, 3368.52] + - [1125, 3368.52] - - [64, 1024, 1, 100] - - [888, 941.709] + - [1024, 941.709] - - [64, 1024, 1, 500] - - [1015, 2659.84] + - [1151, 2659.84] - - [64, 1024, 1, 1024] - - [953, 2452.91] + - [1089, 2452.91] - - [128, 2000, 1, 100] - - [1009, 2560.1] + - [1145, 2560.1] - - [2, 500, 1, 2048] - - [950, 72.2127] + - [1086, 72.2127] - - [16, 512, 1, 10] - - [864, 18.3857] + - [1000, 18.3857] - - [64, 2000, 1, 1024] - - [1020, 2800.78] + - [1156, 2800.78] - - [100, 1024, 1, 1024] - - [948, 3034.17] + - [1084, 3034.17] - - [8, 512, 1, 10] - - [926, 9.24286] + - [1062, 9.24286] - - [16, 500, 1, 2048] - - [950, 565.846] + - [1086, 565.846] - - [10, 100, 1, 500] - - [886, 58.5112] + - [1022, 58.5112] - - [16, 100, 1, 10] - - [926, 3.67143] + - [1062, 3.67143] - - [500, 1024, 1, 512] - - [1016, 6514.61] + - [1152, 6514.61] - - [128, 1024, 1, 512] - - [1034, 4194.4] + - [1170, 4194.4] - - [512, 500, 1, 2000] - - [992, 7347.98] + - [1128, 7347.98] - - [2, 100, 1, 2000] - - [886, 20.9333] + - [1022, 20.9333] - - [500, 512, 1, 100] - - [1008, 2539.78] + - [1144, 2539.78] - - [100, 1024, 1, 500] - - [1034, 3216.18] + - [1170, 3216.18] - - [256, 100, 1, 2048] - - [1044, 1689.17] + - [1180, 1689.17] - - [2, 512, 1, 512] - - [900, 50.5123] + - [1036, 50.5123] - - [128, 2000, 1, 512] - - [1020, 4641.46] + - [1156, 4641.46] - - [2, 100, 1, 10] - - [864, 0.496825] + - [1000, 0.496825] - - [16, 2000, 1, 2048] - - [908, 1266.25] + - [1044, 1266.25] - - [200, 100, 1, 100] - - [1054, 316.556] + - [1190, 316.556] - - [256, 1024, 1, 100] - - [1010, 2686.0] + - [1146, 2686.0] - - [200, 500, 1, 1024] - - [1059, 3282.15] + - [1195, 3282.15] - - [500, 100, 1, 100] - - [973, 631.413] + - [1109, 631.413] - - [4, 100, 1, 10] - - [871, 0.977193] + - [1007, 0.977193] - - [32, 100, 1, 512] - - [950, 198.935] + - [1086, 198.935] - - [100, 2000, 1, 512] - - [1020, 3832.44] + - [1156, 3832.44] - - [16, 1024, 1, 512] - - [934, 794.476] + - [1070, 794.476] - - [200, 512, 1, 100] - - [1052, 1306.22] + - [1188, 1306.22] - - [4, 1024, 1, 1024] - - [893, 213.225] + - [1029, 213.225] - - [512, 1024, 1, 512] - - [1017, 7049.35] + - [1153, 7049.35] - - [4, 512, 1, 10] - - [925, 4.59123] + - [1061, 4.59123] - - [2, 2048, 1, 2000] - - [886, 300.393] + - [1022, 300.393] - - [64, 2048, 1, 10] - - [1046, 241.041] + - [1182, 241.041] - - [128, 100, 1, 10] - - [1051, 27.6862] + - [1187, 27.6862] - - [4, 512, 1, 2048] - - [886, 146.549] + - [1022, 146.549] - - [64, 2048, 1, 500] - - [1026, 4015.79] + - [1162, 4015.79] - - [512, 512, 1, 512] - - [981, 6123.17] + - [1117, 6123.17] - - [500, 500, 1, 2000] - - [992, 7126.67] + - [1128, 7126.67] - - [10, 1024, 1, 2000] - - [959, 807.671] + - [1095, 807.671] - - [256, 100, 1, 100] - - [971, 296.396] + - [1107, 296.396] - - [32, 2000, 1, 2048] - - [914, 2167.3] + - [1050, 2167.3] - - [64, 1024, 1, 2048] - - [947, 2383.23] + - [1083, 2383.23] - - [200, 2048, 1, 512] - - [1022, 5264.04] + - [1158, 5264.04] - - [256, 500, 1, 10] - - [1004, 210.626] + - [1140, 210.626] - - [16, 1024, 1, 100] - - [884, 262.664] + - [1020, 262.664] - - [32, 1024, 1, 1024] - - [889, 1476.97] + - [1025, 1476.97] - - [512, 500, 1, 512] - - [978, 5851.53] + - [1114, 5851.53] - - [128, 1024, 1, 2000] - - [1062, 5516.6] + - [1198, 5516.6] - - [8, 100, 1, 500] - - [886, 46.3963] + - [1022, 46.3963] - - [100, 2000, 1, 2048] - - [1041, 3715.63] + - [1177, 3715.63] - - [10, 512, 1, 512] - - [896, 292.671] + - [1032, 292.671] - - [8, 500, 1, 10] - - [925, 8.87193] + - [1061, 8.87193] - - [10, 2000, 1, 1024] - - [939, 640.1] + - [1075, 640.1] - - [16, 1024, 1, 10] - - [924, 36.6714] + - [1060, 36.6714] - - [16, 512, 1, 2048] - - [903, 585.897] + - [1039, 585.897] - - [256, 512, 1, 10] - - [969, 230.861] + - [1105, 230.861] - - [2, 2000, 1, 100] - - [931, 64.2026] + - [1067, 64.2026] - - [128, 512, 1, 2048] - - [898, 3106.99] + - [1034, 3106.99] - - [128, 512, 1, 100] - - [891, 952.658] + - [1027, 952.658] - - [512, 2000, 1, 1024] - - [988, 8066.07] + - [1124, 8066.07] - - [64, 500, 1, 2048] - - [1057, 1857.7] + - [1193, 1857.7] - - [64, 2000, 1, 2048] - - [1039, 3442.12] + - [1175, 3442.12] - - [64, 2048, 1, 512] - - [1040, 3315.76] + - [1176, 3315.76] - - [10, 2000, 1, 512] - - [886, 785.376] + - [1022, 785.376] - - [32, 2000, 1, 500] - - [889, 2500.1] + - [1025, 2500.1] - - [64, 2000, 1, 10] - - [877, 231.984] + - [1013, 231.984] - - [500, 100, 1, 10] - - [974, 88.1282] + - [1110, 88.1282] - - [128, 1024, 1, 500] - - [1025, 4096.1] + - [1161, 4096.1] - - [64, 100, 1, 2048] - - [886, 587.34] + - [1022, 587.34] - - [64, 100, 1, 10] - - [1045, 12.0403] + - [1181, 12.0403] - - [16, 512, 1, 500] - - [896, 461.361] + - [1032, 461.361] - - [32, 2000, 1, 1024] - - [883, 1713.91] + - [1019, 1713.91] - - [200, 512, 1, 1024] - - [1062, 3244.46] + - [1198, 3244.46] - - [128, 2048, 1, 10] - - [878, 455.211] + - [1014, 455.211] - - [200, 100, 1, 2000] - - [886, 1462.09] + - [1022, 1462.09] - - [2, 100, 1, 512] - - [886, 12.5272] + - [1022, 12.5272] - - [64, 2048, 1, 100] - - [1052, 1689.17] + - [1188, 1689.17] - - [32, 512, 1, 100] - - [885, 266.074] + - [1021, 266.074] - - [16, 512, 1, 1024] - - [950, 569.978] + - [1086, 569.978] - - [4, 1024, 1, 512] - - [940, 208.151] + - [1076, 208.151] - - [64, 2000, 1, 100] - - [1052, 1649.58] + - [1188, 1649.58] - - [512, 2048, 1, 512] - - [988, 7849.09] + - [1124, 7849.09] - - [2, 500, 1, 500] - - [874, 53.5188] + - [1010, 53.5188] - - [32, 100, 1, 100] - - [885, 57.2429] + - [1021, 57.2429] - - [100, 500, 1, 2000] - - [889, 2784.06] + - [1025, 2784.06] - - [200, 2000, 1, 100] - - [961, 2994.11] + - [1097, 2994.11] - - [10, 512, 1, 10] - - [921, 11.1345] + - [1057, 11.1345] - - [100, 500, 1, 2048] - - [1061, 2361.72] + - [1197, 2361.72] - - [4, 2048, 1, 500] - - [896, 379.359] + - [1032, 379.359] - - [200, 500, 1, 100] - - [1022, 1288.76] + - [1158, 1288.76] - - [500, 500, 1, 500] - - [978, 5425.45] + - [1114, 5425.45] - - [2, 100, 1, 1024] - - [950, 16.3025] + - [1086, 16.3025] - - [128, 2048, 1, 512] - - [1036, 4699.6] + - [1172, 4699.6] - - [200, 2000, 1, 1024] - - [986, 4621.04] + - [1122, 4621.04] - - [32, 512, 1, 1024] - - [949, 1028.12] + - [1085, 1028.12] - - [100, 2048, 1, 500] - - [1010, 4142.49] + - [1146, 4142.49] - - [256, 100, 1, 1024] - - [1040, 1443.62] + - [1176, 1443.62] - - [16, 2000, 1, 500] - - [935, 1428.67] + - [1071, 1428.67] - - [128, 100, 1, 100] - - [885, 213.433] + - [1021, 213.433] - - [500, 500, 1, 2048] - - [982, 6639.1] + - [1118, 6639.1] - - [32, 512, 1, 10] - - [918, 36.0298] + - [1054, 36.0298] - - [128, 100, 1, 1024] - - [946, 791.598] + - [1082, 791.598] - - [16, 500, 1, 2000] - - [959, 694.544] + - [1095, 694.544] - - [4, 2048, 1, 100] - - [930, 129.72] + - [1066, 129.72] - - [64, 500, 1, 500] - - [872, 1333.43] + - [1008, 1333.43] - - [500, 1024, 1, 2048] - - [991, 7031.86] + - [1127, 7031.86] - - [512, 2048, 1, 100] - - [966, 5285.26] + - [1102, 5285.26] - - [128, 512, 1, 1024] - - [1058, 2519.2] + - [1194, 2519.2] - - [128, 512, 1, 2000] - - [1056, 3608.91] + - [1192, 3608.91] - - [128, 2000, 1, 2000] - - [1029, 7017.64] + - [1165, 7017.64] - - [2, 512, 1, 10] - - [922, 2.13175] + - [1058, 2.13175] - - [10, 512, 1, 500] - - [886, 293.678] + - [1022, 293.678] - - [4, 1024, 1, 2000] - - [906, 326.215] + - [1042, 326.215] - - [256, 100, 1, 2000] - - [1043, 1768.06] + - [1179, 1768.06] - - [512, 2048, 1, 2000] - - [988, 8674.62] + - [1124, 8674.62] - - [100, 100, 1, 10] - - [1050, 21.6517] + - [1186, 21.6517] - - [256, 500, 1, 1024] - - [990, 4833.14] + - [1126, 4833.14] - - [128, 512, 1, 10] - - [878, 132.229] + - [1014, 132.229] - - [256, 100, 1, 500] - - [1037, 914.386] + - [1173, 914.386] - - [64, 100, 1, 512] - - [944, 369.109] + - [1080, 369.109] - - [64, 512, 1, 500] - - [886, 1600.1] + - [1022, 1600.1] - - [64, 2048, 1, 2000] - - [1040, 5925.6] + - [1176, 5925.6] - - [100, 2048, 1, 1024] - - [998, 3260.6] + - [1134, 3260.6] - - [200, 2000, 1, 10] - - [878, 595.338] + - [1014, 595.338] - - [128, 1024, 1, 100] - - [1022, 1689.17] + - [1158, 1689.17] - - [16, 2000, 1, 100] - - [885, 493.927] + - [1021, 493.927] - - [8, 100, 1, 512] - - [886, 49.8087] + - [1022, 49.8087] - - [500, 2048, 1, 1024] - - [988, 7651.71] + - [1124, 7651.71] - - [500, 2000, 1, 10] - - [976, 1008.16] + - [1112, 1008.16] - - [32, 100, 1, 500] - - [950, 187.016] + - [1086, 187.016] - - [256, 1024, 1, 2048] - - [991, 6190.95] + - [1127, 6190.95] - - [32, 500, 1, 2048] - - [886, 1083.7] + - [1022, 1083.7] - - [4, 2000, 1, 10] - - [929, 17.6439] + - [1065, 17.6439] - - [128, 500, 1, 2000] - - [946, 3516.58] + - [1082, 3516.58] - - [8, 1024, 1, 10] - - [920, 18.0649] + - [1056, 18.0649] - - [2, 500, 1, 100] - - [865, 16.1256] + - [1001, 16.1256] - - [10, 500, 1, 512] - - [886, 291.009] + - [1022, 291.009] - - [10, 2000, 1, 10] - - [864, 38.5615] + - [1000, 38.5615] - - [500, 512, 1, 512] - - [981, 5893.63] + - [1117, 5893.63] - - [32, 500, 1, 500] - - [886, 892.957] + - [1022, 892.957] - - [256, 500, 1, 2000] - - [995, 6237.92] + - [1131, 6237.92] - - [100, 500, 1, 100] - - [897, 726.844] + - [1033, 726.844] - - [500, 2048, 1, 100] - - [970, 4867.02] + - [1106, 4867.02] - - [10, 1024, 1, 512] - - [886, 520.227] + - [1022, 520.227] - - [2, 2048, 1, 512] - - [896, 151.628] + - [1032, 151.628] - - [256, 512, 1, 100] - - [975, 1590.78] + - [1111, 1590.78] - - [10, 2048, 1, 100] - - [886, 324.151] + - [1022, 324.151] - - [8, 2048, 1, 100] - - [941, 256.1] + - [1077, 256.1] - - [512, 100, 1, 512] - - [1037, 2100.61] + - [1173, 2100.61] - - [4, 500, 1, 500] - - [886, 115.841] + - [1022, 115.841] - - [64, 100, 1, 1024] - - [886, 450.21] + - [1022, 450.21] - - [2, 2048, 1, 1024] - - [943, 137.708] + - [1079, 137.708] - - [2, 500, 1, 2000] - - [912, 90.3527] + - [1048, 90.3527] - - [512, 1024, 1, 500] - - [1017, 6898.63] + - [1153, 6898.63] - - [128, 2000, 1, 500] - - [1022, 5161.39] + - [1158, 5161.39] - - [32, 512, 1, 2048] - - [956, 1103.86] + - [1092, 1103.86] - - [10, 100, 1, 2000] - - [886, 106.032] + - [1022, 106.032] - - [4, 100, 1, 512] - - [886, 24.7154] + - [1022, 24.7154] - - [2, 512, 1, 2048] - - [950, 73.3246] + - [1086, 73.3246] - - [200, 512, 1, 2048] - - [1062, 3954.01] + - [1198, 3954.01] - - [200, 2000, 1, 2000] - - [1024, 6230.63] + - [1160, 6230.63] - - [100, 100, 1, 2000] - - [886, 827.915] + - [1022, 827.915] - - [500, 2048, 1, 2000] - - [987, 8388.04] + - [1123, 8388.04] - - [64, 2048, 1, 2048] - - [1032, 3406.64] + - [1168, 3406.64] - - [16, 2000, 1, 1024] - - [892, 1024.1] + - [1028, 1024.1] - - [512, 2048, 1, 1024] - - [965, 8061.22] + - [1101, 8061.22] - - [10, 500, 1, 500] - - [896, 284.191] + - [1032, 284.191] - - [200, 1024, 1, 2048] - - [1060, 4886.29] + - [1196, 4886.29] - - [10, 2000, 1, 2000] - - [886, 1449.38] + - [1022, 1449.38] - - [8, 2000, 1, 500] - - [935, 719.524] + - [1071, 719.524] - - [2, 100, 1, 2048] - - [950, 19.945] + - [1086, 19.945] - - [32, 100, 1, 2048] - - [950, 323.894] + - [1086, 323.894] - - [512, 512, 1, 10] - - [1007, 420.203] + - [1143, 420.203] - - [512, 500, 1, 10] - - [1012, 376.571] + - [1148, 376.571] - - [16, 100, 1, 1024] - - [896, 129.72] + - [1032, 129.72] - - [2, 500, 1, 10] - - [860, 2.21864] + - [996, 2.21864] - - [200, 512, 1, 10] - - [862, 188.335] + - [998, 188.335] - - [512, 1024, 1, 100] - - [962, 3877.97] + - [1098, 3877.97] - - [16, 2000, 1, 2000] - - [886, 2222.32] + - [1022, 2222.32] - - [500, 500, 1, 1024] - - [982, 6130.37] + - [1118, 6130.37] - - [500, 100, 1, 2048] - - [1037, 2949.41] + - [1173, 2949.41] - - [256, 1024, 1, 512] - - [1001, 5886.84] + - [1137, 5886.84] - - [256, 500, 1, 512] - - [979, 4380.85] + - [1115, 4380.85] - - [16, 1024, 1, 2000] - - [950, 1208.36] + - [1086, 1208.36] - - [200, 500, 1, 2048] - - [1062, 3855.52] + - [1198, 3855.52] - - [256, 2000, 1, 10] - - [964, 727.373] + - [1100, 727.373] - - [10, 2048, 1, 2048] - - [917, 823.158] + - [1053, 823.158] - - [512, 2000, 1, 100] - - [966, 5120.1] + - [1102, 5120.1] - - [10, 1024, 1, 1024] - - [893, 553.146] + - [1029, 553.146] - - [512, 2000, 1, 2048] - - [994, 7563.4] + - [1130, 7563.4] - - [500, 1024, 1, 500] - - [1018, 6570.94] + - [1154, 6570.94] - - [500, 100, 1, 512] - - [1037, 2038.32] + - [1173, 2038.32] - - [256, 2000, 1, 100] - - [986, 3764.81] + - [1122, 3764.81] - - [512, 1024, 1, 2048] - - [1030, 7286.62] + - [1166, 7286.62] - - [32, 512, 1, 500] - - [886, 898.346] + - [1022, 898.346] - - [100, 2000, 1, 10] - - [878, 333.433] + - [1014, 333.433] - - [100, 500, 1, 512] - - [1056, 2176.97] + - [1192, 2176.97] - - [8, 2000, 1, 512] - - [935, 602.453] + - [1071, 602.453] - - [100, 2048, 1, 2048] - - [1042, 3694.87] + - [1178, 3694.87] - - [128, 1024, 1, 2048] - - [1061, 4168.35] + - [1197, 4168.35] - - [8, 500, 1, 2000] - - [960, 352.213] + - [1096, 352.213] - - [100, 2000, 1, 500] - - [1010, 4045.41] + - [1146, 4045.41] - - [100, 2048, 1, 100] - - [1010, 2081.4] + - [1146, 2081.4] - - [4, 100, 1, 1024] - - [886, 33.1323] + - [1022, 33.1323] - - [500, 2048, 1, 2048] - - [994, 7765.03] + - [1130, 7765.03] - - [2, 2000, 1, 2048] - - [905, 166.334] + - [1041, 166.334] - - [200, 2048, 1, 10] - - [879, 609.624] + - [1015, 609.624] - - [2, 500, 1, 1024] - - [950, 75.3941] + - [1086, 75.3941] - - [100, 500, 1, 1024] - - [946, 1975.41] + - [1082, 1975.41] - - [16, 2048, 1, 500] - - [886, 1473.48] + - [1022, 1473.48] - - [100, 1024, 1, 10] - - [1046, 185.607] + - [1182, 185.607] - - [8, 2048, 1, 1024] - - [942, 543.404] + - [1078, 543.404] - - [2, 2000, 1, 500] - - [886, 179.956] + - [1022, 179.956] - - [32, 100, 1, 1024] - - [886, 267.812] + - [1022, 267.812] - - [500, 2000, 1, 512] - - [1016, 7087.59] + - [1152, 7087.59] - - [64, 100, 1, 2000] - - [896, 615.485] + - [1032, 615.485] - - [100, 1024, 1, 2000] - - [1059, 4224.52] + - [1195, 4224.52] - - [64, 500, 1, 10] - - [861, 63.5921] + - [997, 63.5921] - - [32, 2048, 1, 100] - - [882, 941.709] + - [1018, 941.709] - - [64, 500, 1, 512] - - [886, 1575.48] + - [1022, 1575.48] - - [10, 100, 1, 1024] - - [896, 82.6806] + - [1032, 82.6806] - - [16, 512, 1, 100] - - [885, 148.506] + - [1021, 148.506] - - [4, 100, 1, 2000] - - [959, 43.9597] + - [1095, 43.9597] - - [2, 512, 1, 1024] - - [950, 74.152] + - [1086, 74.152] - - [64, 512, 1, 1024] - - [951, 1571.0] + - [1087, 1571.0] - - [10, 2048, 1, 500] - - [886, 920.963] + - [1022, 920.963] - - [4, 2000, 1, 2048] - - [905, 326.215] + - [1041, 326.215] - - [512, 100, 1, 2048] - - [1040, 3084.15] + - [1176, 3084.15] - - [32, 100, 1, 2000] - - [886, 343.448] + - [1022, 343.448] - - [256, 512, 1, 500] - - [979, 4311.68] + - [1115, 4311.68] - - [100, 2000, 1, 100] - - [1010, 2016.23] + - [1146, 2016.23] - - [8, 2000, 1, 1024] - - [899, 544.781] + - [1035, 544.781] - - [4, 512, 1, 500] - - [886, 118.619] + - [1022, 118.619] - - [128, 1024, 1, 10] - - [1049, 244.637] + - [1185, 244.637] - - [4, 500, 1, 1024] - - [886, 144.733] + - [1022, 144.733] - - [32, 2048, 1, 512] - - [889, 2140.05] + - [1025, 2140.05] - - [32, 100, 1, 10] - - [864, 7.11754] + - [1000, 7.11754] - - [100, 2048, 1, 10] - - [1053, 341.433] + - [1189, 341.433] - - [512, 500, 1, 100] - - [1014, 2461.64] + - [1150, 2461.64] - - [128, 2000, 1, 1024] - - [998, 4174.37] + - [1134, 4174.37] - - [200, 1024, 1, 500] - - [1010, 4295.4] + - [1146, 4295.4] - - [32, 2048, 1, 1024] - - [913, 1667.82] + - [1049, 1667.82] - - [10, 1024, 1, 2048] - - [904, 555.49] + - [1040, 555.49] - - [8, 500, 1, 100] - - [885, 71.5286] + - [1021, 71.5286] - - [32, 2048, 1, 500] - - [889, 2528.5] + - [1025, 2528.5] - - [200, 100, 1, 1024] - - [898, 1071.23] + - [1034, 1071.23] - - [16, 100, 1, 100] - - [875, 28.6714] + - [1011, 28.6714] - - [8, 1024, 1, 2000] - - [959, 654.413] + - [1095, 654.413] - - [4, 512, 1, 100] - - [885, 36.6714] + - [1021, 36.6714] - - [16, 500, 1, 100] - - [885, 142.957] + - [1021, 142.957] - - [8, 1024, 1, 2048] - - [911, 441.606] + - [1047, 441.606] - - [16, 1024, 1, 2048] - - [912, 886.845] + - [1048, 886.845] - - [10, 2048, 1, 1024] - - [890, 639.476] + - [1026, 639.476] - - [64, 512, 1, 100] - - [885, 518.581] + - [1021, 518.581] - - [2, 100, 1, 500] - - [886, 9.71538] + - [1022, 9.71538] - - [2, 500, 1, 512] - - [892, 48.2203] + - [1028, 48.2203] - - [256, 512, 1, 2000] - - [995, 6450.49] + - [1131, 6450.49] - - [128, 500, 1, 1024] - - [889, 2497.66] + - [1025, 2497.66] - - [10, 100, 1, 10] - - [926, 2.33214] + - [1062, 2.33214] - - [8, 2048, 1, 2048] - - [876, 643.398] + - [1012, 643.398] - - [16, 2048, 1, 2048] - - [916, 1338.0] + - [1052, 1338.0] - - [64, 1024, 1, 10] - - [879, 132.229] + - [1015, 132.229] - - [500, 100, 1, 500] - - [1037, 1941.09] + - [1173, 1941.09] - - [256, 1024, 1, 2000] - - [1033, 7629.44] + - [1169, 7629.44] - - [200, 512, 1, 500] - - [1022, 3232.42] + - [1158, 3232.42] - - [8, 2000, 1, 10] - - [923, 32.3581] + - [1059, 32.3581] - - [64, 2000, 1, 512] - - [1021, 3225.3] + - [1157, 3225.3] - - [2, 512, 1, 100] - - [865, 16.7234] + - [1001, 16.7234] - - [4, 2000, 1, 2000] - - [886, 586.61] + - [1022, 586.61] - - [200, 1024, 1, 100] - - [1010, 2133.43] + - [1146, 2133.43] - - [16, 100, 1, 500] - - [950, 92.6926] + - [1086, 92.6926] - - [128, 100, 1, 500] - - [946, 526.416] + - [1082, 526.416] - - [500, 1024, 1, 1024] - - [980, 7201.86] + - [1116, 7201.86] - - [200, 1024, 1, 1024] - - [1032, 4519.82] + - [1168, 4519.82] - - [8, 2048, 1, 512] - - [896, 624.252] + - [1032, 624.252] - - [200, 2000, 1, 500] - - [986, 5186.82] + - [1122, 5186.82] - - [512, 100, 1, 1024] - - [1037, 2742.19] + - [1173, 2742.19] - - [16, 100, 1, 2000] - - [896, 168.876] + - [1032, 168.876] - - [500, 512, 1, 2000] - - [1033, 7289.39] + - [1169, 7289.39] - - [8, 2000, 1, 2048] - - [907, 668.289] + - [1043, 668.289] - - [256, 2048, 1, 100] - - [968, 3924.41] + - [1104, 3924.41] - - [32, 2048, 1, 2000] - - [900, 3882.56] + - [1036, 3882.56] - - [200, 500, 1, 512] - - [1025, 3368.52] + - [1161, 3368.52] - - [10, 512, 1, 100] - - [885, 91.5286] + - [1021, 91.5286] - - [16, 2000, 1, 10] - - [863, 61.6385] + - [999, 61.6385] - - [8, 512, 1, 100] - - [885, 72.2127] + - [1021, 72.2127] - - [256, 512, 1, 512] - - [990, 4584.04] + - [1126, 4584.04] - - [500, 2000, 1, 1024] - - [965, 7569.59] + - [1101, 7569.59] - - [512, 512, 1, 500] - - [981, 5708.81] + - [1117, 5708.81] - - [256, 2048, 1, 1024] - - [1005, 5923.21] + - [1141, 5923.21] - - [8, 2048, 1, 2000] - - [886, 1153.9] + - [1022, 1153.9] - - [100, 512, 1, 2048] - - [952, 2383.23] + - [1088, 2383.23] - - [100, 1024, 1, 512] - - [1037, 3343.77] + - [1173, 3343.77] - - [128, 100, 1, 2000] - - [1055, 1084.85] + - [1191, 1084.85] - - [4, 2048, 1, 2048] - - [904, 332.454] + - [1040, 332.454] - - [2, 1024, 1, 2000] - - [915, 161.106] + - [1051, 161.106] - - [100, 512, 1, 512] - - [889, 2184.63] + - [1025, 2184.63] - - [128, 1024, 1, 1024] - - [1032, 3848.09] + - [1168, 3848.09] - - [200, 2048, 1, 1024] - - [967, 4547.26] + - [1103, 4547.26] - - [32, 1024, 1, 2000] - - [896, 2416.62] + - [1032, 2416.62] - - [128, 500, 1, 100] - - [891, 919.64] + - [1027, 919.64] - - [200, 512, 1, 2000] - - [1059, 4238.51] + - [1195, 4238.51] - - [10, 2048, 1, 2000] - - [896, 1454.65] + - [1032, 1454.65] - - [256, 1024, 1, 500] - - [993, 5669.3] + - [1129, 5669.3] - - [100, 100, 1, 100] - - [885, 171.333] + - [1021, 171.333] - - [8, 512, 1, 1024] - - [954, 286.596] + - [1090, 286.596] - - [200, 1024, 1, 512] - - [1010, 4354.65] + - [1146, 4354.65] - - [256, 500, 1, 500] - - [995, 4020.2] + - [1131, 4020.2] - - [200, 100, 1, 500] - - [1059, 702.347] + - [1195, 702.347] - - [2, 1024, 1, 2048] - - [905, 112.85] + - [1041, 112.85] - - [256, 500, 1, 2048] - - [995, 5041.33] + - [1131, 5041.33] - - [512, 2048, 1, 500] - - [988, 7710.22] + - [1124, 7710.22] - - [512, 100, 1, 2000] - - [1037, 3099.37] + - [1173, 3099.37] - - [512, 500, 1, 1024] - - [996, 6463.22] + - [1132, 6463.22] - - [16, 512, 1, 2000] - - [912, 721.227] + - [1048, 721.227] - - [64, 500, 1, 1024] - - [951, 1528.46] + - [1087, 1528.46] - - [512, 2000, 1, 10] - - [972, 1174.41] + - [1108, 1174.41] - - [256, 512, 1, 1024] - - [990, 4978.5] + - [1126, 4978.5] - - [10, 512, 1, 1024] - - [950, 370.36] + - [1086, 370.36] - - [512, 100, 1, 100] - - [973, 659.894] + - [1109, 659.894] - - [8, 2000, 1, 100] - - [885, 256.51] + - [1021, 256.51] - - [128, 2048, 1, 1024] - - [998, 4173.54] + - [1134, 4173.54] - - [2, 2000, 1, 2000] - - [886, 250.727] + - [1022, 250.727] - - [16, 2048, 1, 1024] - - [933, 1046.06] + - [1069, 1046.06] - - [500, 512, 1, 500] - - [978, 5517.34] + - [1114, 5517.34] - - [8, 100, 1, 1024] - - [951, 64.1] + - [1087, 64.1] - - [10, 100, 1, 100] - - [875, 17.9571] + - [1011, 17.9571] - - [200, 500, 1, 500] - - [1025, 3140.8] + - [1161, 3140.8] - - [10, 500, 1, 2000] - - [912, 444.94] + - [1048, 444.94] - - [500, 100, 1, 2000] - - [1040, 2969.22] + - [1176, 2969.22] - - [100, 512, 1, 2000] - - [952, 2776.67] + - [1088, 2776.67] - - [500, 1024, 1, 2000] - - [1031, 8020.15] + - [1167, 8020.15] - - [32, 2000, 1, 2000] - - [892, 3827.85] + - [1028, 3827.85] - - [64, 1024, 1, 512] - - [1056, 2573.29] + - [1192, 2573.29] - - [64, 2000, 1, 2000] - - [1025, 5797.2] + - [1161, 5797.2] - - [32, 500, 1, 100] - - [885, 266.767] + - [1021, 266.767] - - [128, 2000, 1, 2048] - - [1041, 4548.05] + - [1177, 4548.05] - - [10, 100, 1, 2048] - - [950, 98.5615] + - [1086, 98.5615] - - [32, 2048, 1, 2048] - - [913, 2213.45] + - [1049, 2213.45] - - [64, 100, 1, 100] - - [886, 96.4855] + - [1022, 96.4855] - - [2, 1024, 1, 100] - - [936, 34.6946] + - [1072, 34.6946] - - [256, 1024, 1, 10] - - [1006, 425.658] + - [1142, 425.658] - - [256, 1024, 1, 1024] - - [999, 5482.85] + - [1135, 5482.85] - - [64, 500, 1, 2000] - - [886, 2056.66] + - [1022, 2056.66] - - [512, 2000, 1, 512] - - [984, 7550.33] + - [1120, 7550.33] - - [8, 512, 1, 512] - - [893, 232.086] + - [1029, 232.086] - - [8, 512, 1, 2048] - - [886, 290.564] + - [1022, 290.564] - - [100, 100, 1, 1024] - - [1056, 624.49] + - [1192, 624.49] - - [2, 2048, 1, 10] - - [929, 8.92759] + - [1065, 8.92759] - - [4, 2048, 1, 512] - - [935, 312.176] + - [1071, 312.176] - - [4, 2048, 1, 10] - - [928, 18.0649] + - [1064, 18.0649] - - [8, 100, 1, 2000] - - [905, 85.9369] + - [1041, 85.9369] - - [2, 1024, 1, 1024] - - [902, 101.314] + - [1038, 101.314] - - [16, 2048, 1, 100] - - [886, 518.581] + - [1022, 518.581] - - [16, 512, 1, 512] - - [896, 456.003] + - [1032, 456.003] - - [32, 500, 1, 512] - - [893, 906.295] + - [1029, 906.295] - - [500, 2000, 1, 2000] - - [988, 8143.42] + - [1124, 8143.42] - - [500, 1024, 1, 10] - - [969, 680.951] + - [1105, 680.951] - - [32, 500, 1, 1024] - - [945, 1008.97] + - [1081, 1008.97] - - [32, 500, 1, 10] - - [881, 33.4333] + - [1017, 33.4333] - - [500, 500, 1, 10] - - [1010, 367.747] + - [1146, 367.747] - - [4, 2000, 1, 500] - - [896, 370.47] + - [1032, 370.47] - - [10, 2000, 1, 500] - - [886, 899.381] + - [1022, 899.381] - - [32, 2000, 1, 512] - - [898, 2089.9] + - [1034, 2089.9] - - [256, 500, 1, 100] - - [1011, 1495.43] + - [1147, 1495.43] - - [256, 2048, 1, 10] - - [969, 789.69] + - [1105, 789.69] - - [4, 1024, 1, 500] - - [886, 222.709] + - [1022, 222.709] - - [256, 512, 1, 2048] - - [995, 5292.6] + - [1131, 5292.6] - - [2, 2000, 1, 1024] - - [933, 137.365] + - [1069, 137.365] - - [256, 100, 1, 512] - - [1037, 1085.13] + - [1173, 1085.13] - - [8, 1024, 1, 500] - - [886, 441.479] + - [1022, 441.479] - - [256, 2048, 1, 500] - - [1016, 7031.86] + - [1152, 7031.86] - - [256, 2048, 1, 2048] - - [979, 6771.93] + - [1115, 6771.93] - - [2, 2000, 1, 512] - - [940, 159.106] + - [1076, 159.106] - - [256, 2000, 1, 512] - - [983, 6527.59] + - [1119, 6527.59] - - [4, 1024, 1, 100] - - [932, 70.237] + - [1068, 70.237] - - [512, 1024, 1, 2000] - - [1017, 8295.8] + - [1153, 8295.8] - - [100, 500, 1, 500] - - [889, 2016.23] + - [1025, 2016.23] - - [4, 2048, 1, 1024] - - [937, 285.039] + - [1073, 285.039] - - [2, 1024, 1, 500] - - [886, 109.502] + - [1022, 109.502] - - [64, 100, 1, 500] - - [886, 296.396] + - [1022, 296.396] - - [256, 2000, 1, 2000] - - [994, 8152.97] + - [1130, 8152.97] - - [2, 512, 1, 500] - - [892, 44.8552] + - [1028, 44.8552] - - [8, 2048, 1, 500] - - [886, 736.791] + - [1022, 736.791] - - [10, 1024, 1, 500] - - [886, 547.109] + - [1022, 547.109] - - [4, 2048, 1, 2000] - - [896, 604.23] + - [1032, 604.23] - - [200, 1024, 1, 2000] - - [1063, 5400.94] + - [1199, 5400.94] - - [128, 500, 1, 512] - - [1056, 2730.77] + - [1192, 2730.77] - - [10, 500, 1, 2048] - - [950, 359.651] + - [1086, 359.651] - - [256, 2048, 1, 2000] - - [994, 8375.31] + - [1130, 8375.31] - - [8, 2000, 1, 2000] - - [896, 1146.23] + - [1032, 1146.23] - - [100, 2048, 1, 512] - - [1019, 3936.2] + - [1155, 3936.2] - - [512, 500, 1, 2048] - - [995, 6756.39] + - [1131, 6756.39] - - [200, 2048, 1, 100] - - [986, 3180.22] + - [1122, 3180.22] - - [128, 512, 1, 512] - - [889, 2872.91] + - [1025, 2872.91] - - [200, 2000, 1, 2048] - - [1035, 4818.92] + - [1171, 4818.92] - - [4, 2000, 1, 1024] - - [933, 275.369] + - [1069, 275.369] - - [64, 512, 1, 10] - - [1048, 69.5237] + - [1184, 69.5237] - - [32, 500, 1, 2000] - - [915, 1246.21] + - [1051, 1246.21] - - [128, 2048, 1, 2000] - - [1028, 7233.65] + - [1164, 7233.65] - - [100, 100, 1, 2048] - - [886, 790.223] + - [1022, 790.223] - - [500, 2048, 1, 512] - - [1016, 7249.66] + - [1152, 7249.66] - - [200, 100, 1, 512] - - [892, 748.638] + - [1028, 748.638] - - [32, 2000, 1, 100] - - [887, 930.333] + - [1023, 930.333] - - [500, 512, 1, 2048] - - [1038, 6640.02] + - [1174, 6640.02] - - [500, 2000, 1, 500] - - [1018, 7078.24] + - [1154, 7078.24] - - [200, 100, 1, 2048] - - [896, 1387.63] + - [1032, 1387.63] - - [2, 2048, 1, 100] - - [930, 64.9101] + - [1066, 64.9101] - - [8, 100, 1, 10] - - [871, 1.85439] + - [1007, 1.85439] - - [200, 2048, 1, 2048] - - [1035, 5022.02] + - [1171, 5022.02] - - [200, 2048, 1, 500] - - [986, 5355.75] + - [1122, 5355.75] - - [100, 100, 1, 500] - - [1056, 416.767] + - [1192, 416.767] - - [8, 2048, 1, 10] - - [927, 34.8119] + - [1063, 34.8119] - - [100, 500, 1, 10] - - [867, 93.3836] + - [1003, 93.3836] - - [200, 500, 1, 2000] - - [1059, 4152.92] + - [1195, 4152.92] - - [512, 2000, 1, 500] - - [988, 7485.48] + - [1124, 7485.48] - - [10, 500, 1, 1024] - - [954, 363.736] + - [1090, 363.736] - - [256, 100, 1, 10] - - [1003, 41.1256] + - [1139, 41.1256] - - [500, 512, 1, 1024] - - [982, 6362.82] + - [1118, 6362.82] - - [200, 2048, 1, 2000] - - [1024, 6321.09] + - [1160, 6321.09] - - [100, 1024, 1, 100] - - [1023, 1306.22] + - [1159, 1306.22] - - [500, 1024, 1, 100] - - [962, 3699.52] + - [1098, 3699.52] - - [10, 512, 1, 2048] - - [886, 361.18] + - [1022, 361.18] - - [2, 1024, 1, 512] - - [935, 105.803] + - [1071, 105.803] - - [4, 500, 1, 2048] - - [958, 143.517] + - [1094, 143.517] - - [100, 512, 1, 100] - - [891, 744.286] + - [1027, 744.286] - - [16, 500, 1, 512] - - [886, 453.197] + - [1022, 453.197] - - [10, 1024, 1, 100] - - [884, 166.334] + - [1020, 166.334] - - [8, 1024, 1, 100] - - [932, 140.374] + - [1068, 140.374] - - [64, 2000, 1, 500] - - [1027, 3940.99] + - [1163, 3940.99] - - [64, 1024, 1, 2000] - - [892, 3531.13] + - [1028, 3531.13] - - [10, 100, 1, 512] - - [886, 61.6385] + - [1022, 61.6385] - - [4, 500, 1, 2000] - - [912, 173.11] + - [1048, 173.11] - - [512, 1024, 1, 10] - - [963, 736.46] + - [1099, 736.46] - - [128, 2048, 1, 2048] - - [1026, 4596.6] + - [1162, 4596.6] - - [4, 100, 1, 100] - - [875, 7.24286] + - [1011, 7.24286] - - [32, 1024, 1, 512] - - [935, 1519.78] + - [1071, 1519.78] - - [8, 512, 1, 2000] - - [960, 356.894] + - [1096, 356.894] - - [100, 100, 1, 512] - - [900, 426.767] + - [1036, 426.767] - - [2, 2048, 1, 2048] - - [909, 170.878] + - [1045, 170.878] - - [2, 512, 1, 2000] - - [912, 90.8801] + - [1048, 90.8801] - - [16, 500, 1, 10] - - [885, 18.2818] + - [1021, 18.2818] - - [10, 500, 1, 100] - - [885, 88.1282] + - [1021, 88.1282] - - [4, 100, 1, 500] - - [950, 23.6849] + - [1086, 23.6849] - - [512, 1024, 1, 1024] - - [1002, 7431.87] + - [1138, 7431.87] - - [64, 500, 1, 100] - - [895, 506.429] + - [1031, 506.429] - - [128, 2000, 1, 10] - - [1053, 432.532] + - [1189, 432.532] - - [10, 2000, 1, 2048] - - [916, 806.399] + - [1052, 806.399] - - [2, 100, 1, 100] - - [873, 3.225] + - [1009, 3.225] - - [10, 512, 1, 2000] - - [905, 462.194] + - [1041, 462.194] - - [8, 500, 1, 500] - - [886, 231.581] + - [1022, 231.581] - - [4, 500, 1, 512] - - [886, 118.619] + - [1022, 118.619] - - [10, 500, 1, 10] - - [880, 11.0649] + - [1016, 11.0649] - - [64, 512, 1, 2000] - - [886, 2116.9] + - [1022, 2116.9] - - [500, 512, 1, 10] - - [1007, 395.162] + - [1143, 395.162] - - [200, 512, 1, 512] - - [1025, 3449.36] + - [1161, 3449.36] - - [512, 500, 1, 500] - - [981, 5536.43] + - [1117, 5536.43] - - [32, 512, 1, 2000] - - [896, 1264.3] + - [1032, 1264.3] - - [128, 500, 1, 2048] - - [952, 3006.34] + - [1088, 3006.34] - - [500, 2048, 1, 10] - - [977, 1049.28] + - [1113, 1049.28] - - [512, 512, 1, 100] - - [1014, 2664.16] + - [1150, 2664.16] - - [200, 2000, 1, 512] - - [1022, 5192.8] + - [1158, 5192.8] - - [500, 500, 1, 512] - - [978, 5673.86] + - [1114, 5673.86] - - [128, 2048, 1, 500] - - [1010, 5251.38] + - [1146, 5251.38] - - [4, 512, 1, 512] - - [886, 123.753] + - [1022, 123.753] - - [16, 2048, 1, 2000] - - [902, 2294.78] + - [1038, 2294.78] - - [16, 500, 1, 1024] - - [886, 562.737] + - [1022, 562.737] - - [256, 2000, 1, 500] - - [1016, 6639.1] + - [1152, 6639.1] - - [10, 1024, 1, 10] - - [866, 21.0836] + - [1002, 21.0836] - - [16, 500, 1, 500] - - [886, 446.529] + - [1022, 446.529] - - [10, 2048, 1, 512] - - [884, 784.962] + - [1020, 784.962] - - [200, 500, 1, 10] - - [859, 176.156] + - [995, 176.156] - - [256, 2048, 1, 512] - - [1013, 6540.93] + - [1149, 6540.93] - - [256, 2000, 1, 2048] - - [990, 6670.43] + - [1126, 6670.43] - - [500, 2048, 1, 500] - - [1018, 7264.57] + - [1154, 7264.57] - - [500, 100, 1, 1024] - - [1040, 2700.52] + - [1176, 2700.52] - - [16, 100, 1, 512] - - [950, 96.7038] + - [1086, 96.7038] - - [64, 512, 1, 2048] - - [951, 1868.39] + - [1087, 1868.39] - - [32, 1024, 1, 10] - - [862, 69.5237] + - [998, 69.5237] - - [16, 2048, 1, 512] - - [935, 1226.5] + - [1071, 1226.5] - - [8, 1024, 1, 512] - - [935, 416.202] + - [1071, 416.202] - - [4, 1024, 1, 2048] - - [957, 223.201] + - [1093, 223.201] - - [100, 2048, 1, 2000] - - [1030, 5614.14] + - [1166, 5614.14] - - [512, 512, 1, 2048] - - [995, 6868.97] + - [1131, 6868.97] - - [256, 2000, 1, 1024] - - [986, 5758.98] + - [1122, 5758.98] - - [64, 512, 1, 512] - - [1055, 1651.4] + - [1191, 1651.4] - - [200, 1024, 1, 10] - - [869, 341.433] + - [1005, 341.433] - - [128, 500, 1, 500] - - [898, 2580.75] + - [1034, 2580.75] - - [100, 512, 1, 1024] - - [889, 2041.72] + - [1025, 2041.72] - - [16, 1024, 1, 500] - - [886, 867.897] + - [1022, 867.897] - - [128, 100, 1, 2048] - - [1056, 1011.46] + - [1192, 1011.46] - - [100, 512, 1, 500] - - [889, 2051.38] + - [1025, 2051.38] - - [8, 1024, 1, 1024] - - [902, 424.625] + - [1038, 424.625] - - [2, 2000, 1, 10] - - [928, 8.57458] + - [1064, 8.57458] - - [4, 500, 1, 10] - - [925, 4.56429] + - [1061, 4.56429] - - [500, 2000, 1, 2048] - - [1002, 7444.12] + - [1138, 7444.12] - - [4, 2000, 1, 100] - - [938, 128.305] + - [1074, 128.305] - - [512, 2000, 1, 2000] - - [988, 8454.53] + - [1124, 8454.53] - - [128, 500, 1, 10] - - [1047, 117.747] + - [1183, 117.747] - - [32, 1024, 1, 100] - - [895, 512.1] + - [1031, 512.1] - - [8, 500, 1, 2048] - - [910, 286.935] + - [1046, 286.935] - - [16, 1024, 1, 1024] - - [874, 881.256] + - [1010, 881.256] - - [200, 100, 1, 10] - - [1046, 40.4226] + - [1182, 40.4226] - - [512, 100, 1, 500] - - [1040, 1987.68] + - [1176, 1987.68] - - [512, 2048, 1, 2048] - - [997, 8063.65] + - [1133, 8063.65] - - [16, 2000, 1, 512] - - [896, 1204.81] + - [1032, 1204.81] - - [64, 2048, 1, 1024] - - [894, 2853.37] + - [1030, 2853.37] - - [32, 2048, 1, 10] - - [868, 130.132] + - [1004, 130.132] - - [10, 2048, 1, 10] - - [870, 39.4846] + - [1006, 39.4846] - - [4, 2000, 1, 512] - - [886, 316.149] + - [1022, 316.149] - - [4, 500, 1, 100] - - [885, 35.8143] + - [1021, 35.8143] - - [8, 100, 1, 2048] - - [905, 84.7281] + - [1041, 84.7281] - - [512, 2048, 1, 10] - - [985, 1225.07] + - [1121, 1225.07] - - [512, 100, 1, 10] - - [974, 90.2408] + - [1110, 90.2408] - - [4, 512, 1, 1024] - - [886, 143.348] + - [1022, 143.348] - - [16, 2048, 1, 10] - - [919, 65.1159] + - [1055, 65.1159] - - [500, 2000, 1, 100] - - [970, 4717.08] + - [1106, 4717.08] - - [32, 1024, 1, 2048] - - [913, 1582.86] + - [1049, 1582.86] - - [100, 2000, 1, 2000] - - [1030, 5512.78] + - [1166, 5512.78] - - [128, 100, 1, 512] - - [1056, 561.196] + - [1192, 561.196] - - [500, 500, 1, 100] - - [1010, 2460.73] + - [1146, 2460.73] - - [32, 2000, 1, 10] - - [862, 119.503] + - [998, 119.503] - - [128, 2048, 1, 100] - - [1010, 2708.2] + - [1146, 2708.2] - - [10, 2000, 1, 100] - - [885, 316.556] + - [1021, 316.556] - - [2, 2048, 1, 500] - - [896, 191.145] + - [1032, 191.145] - - [32, 1024, 1, 500] - - [896, 1563.46] + - [1032, 1563.46] - - [4, 1024, 1, 10] - - [925, 9.24286] + - [1061, 9.24286] - - [100, 512, 1, 10] - - [1051, 97.0697] + - [1187, 97.0697] - - [8, 100, 1, 100] - - [901, 14.3857] + - [1037, 14.3857] - - [128, 512, 1, 500] - - [889, 2677.22] + - [1025, 2677.22] - - [16, 100, 1, 2048] - - [912, 161.997] + - [1048, 161.997] - - [2, 1024, 1, 10] - - [925, 4.59123] + - [1061, 4.59123] - - [4, 100, 1, 2048] - - [905, 41.8959] + - [1041, 41.8959] - - [4, 512, 1, 2000] - - [905, 180.382] + - [1041, 180.382] - - [4096, 64, 1, 2048] - - [1105, 7247.28] + - [1241, 7247.28] - - [1024, 10080, 1, 1024] - - [1093, 9833.47] + - [1229, 9833.47] - - [1024, 1131, 1, 1024] - - [1071, 7551.95] + - [1207, 7551.95] - - [36548, 1216, 1, 1024] - - [1083, 10351.6] + - [1219, 10351.6] - - [1024, 29, 1, 1024] - - [1115, 1697.01] + - [1251, 1697.01] - - [1024, 2592, 1, 1024] - - [1084, 8424.11] + - [1220, 8424.11] - - [1024, 1568, 1, 1024] - - [1095, 7511.86] + - [1231, 7511.86] - - [4096, 91, 1, 2048] - - [1064, 5599.91] + - [1200, 5599.91] - - [1024, 4445, 1, 1024] - - [1082, 9261.22] + - [1218, 9261.22] - - [1024, 6272, 1, 1024] - - [1077, 9439.61] + - [1213, 9439.61] - - [36548, 3584, 1, 1024] - - [1076, 10393.8] + - [1212, 10393.8] - - [1024, 1827, 1, 1024] - - [1095, 8714.42] + - [1231, 8714.42] - - [1024, 3220, 1, 1024] - - [1075, 8861.2] + - [1211, 8861.2] - - [1024, 1856, 1, 1024] - - [1092, 8827.05] + - [1228, 8827.05] - - [1024, 1760, 1, 1024] - - [1092, 8334.2] + - [1228, 8334.2] - - [1024, 1600, 1, 1024] - - [1092, 7615.07] + - [1228, 7615.07] - - [1024, 1, 1, 21] - - [1096, 0.1] + - [1232, 0.1] - - [36548, 4235, 1, 1024] - - [1076, 10276.8] + - [1212, 10276.8] - - [1024, 49, 1, 1024] - - [1111, 2643.12] + - [1247, 2643.12] - - [1024, 1984, 1, 1024] - - [1095, 9449.52] + - [1231, 9449.52] - - [1024, 14720, 1, 1024] - - [1082, 10033.3] + - [1218, 10033.3] - - [1024, 1152, 1, 1024] - - [1065, 7523.54] + - [1201, 7523.54] - - [36548, 14976, 1, 1024] - - [1083, 10421.7] + - [1219, 10421.7] - - [36548, 1152, 1, 1024] - - [1083, 10258.1] + - [1219, 10258.1] - - [4096, 86, 1, 3072] - - [1064, 5308.85] + - [1200, 5308.85] - - [1024, 3392, 1, 1024] - - [1077, 9176.54] + - [1213, 9176.54] - - [1024, 1408, 1, 1024] - - [1077, 8958.83] + - [1213, 8958.83] - - [1024, 2080, 1, 1024] - - [1068, 8396.49] + - [1204, 8396.49] - - [1024, 1824, 1, 1024] - - [1086, 8671.71] + - [1222, 8671.71] - - [36548, 2432, 1, 1024] - - [1076, 10392.6] + - [1212, 10392.6] - - [4096, 29, 1, 2048] - - [1097, 4325.66] + - [1233, 4325.66] - - [1024, 1102, 1, 1024] - - [1071, 7204.18] + - [1207, 7204.18] - - [4096, 49, 1, 2048] - - [1103, 5609.29] + - [1239, 5609.29] - - [36548, 1827, 1, 1024] - - [1083, 10183.2] + - [1219, 10183.2] - - [4096, 25, 1, 2048] - - [1098, 3788.31] + - [1234, 3788.31] - - [1024, 10176, 1, 1024] - - [1093, 9941.18] + - [1229, 9941.18] - - [1024, 774, 1, 1024] - - [1078, 7079.67] + - [1214, 7079.67] - - [1024, 1952, 1, 1024] - - [1095, 9300.49] + - [1231, 9300.49] - - [4096, 128, 1, 2048] - - [1065, 8274.96] + - [1201, 8274.96] - - [1024, 17024, 1, 1024] - - [1075, 9960.72] + - [1211, 9960.72] - - [1024, 1472, 1, 1024] - - [1084, 9343.37] + - [1220, 9343.37] - - [36548, 4459, 1, 1024] - - [1076, 10358.1] + - [1212, 10358.1] - - [4096, 91, 1, 3072] - - [1070, 5509.39] + - [1206, 5509.39] - - [1024, 3712, 1, 1024] - - [1084, 9048.66] + - [1220, 9048.66] - - [4096, 64, 1, 3072] - - [1117, 7489.93] + - [1253, 7489.93] - - [4096, 29, 1, 3072] - - [1097, 4511.78] + - [1233, 4511.78] - - [4096, 128, 1, 3072] - - [1064, 8423.83] + - [1200, 8423.83] - - [36548, 12928, 1, 1024] - - [1083, 10426.1] + - [1219, 10426.1] - - [1024, 1632, 1, 1024] - - [1065, 7761.73] + - [1201, 7761.73] - - [1024, 1696, 1, 1024] - - [1090, 8107.29] + - [1226, 8107.29] - - [4096, 24, 1, 2048] - - [1097, 3663.25] + - [1233, 3663.25] - - [4096, 63, 1, 3072] - - [1106, 7175.37] + - [1242, 7175.37] - - [4096, 96, 1, 2048] - - [1065, 5866.28] + - [1201, 5866.28] - - [36548, 1764, 1, 1024] - - [1076, 10128.5] + - [1212, 10128.5] - - [4096, 32, 1, 2048] - - [1101, 4540.62] + - [1237, 4540.62] - - [1024, 35, 1, 1024] - - [1109, 1911.57] + - [1245, 1911.57] - - [1024, 1120, 1, 1024] - - [1064, 7289.13] + - [1200, 7289.13] - - [4096, 49, 1, 3072] - - [1103, 5751.62] + - [1239, 5751.62] - - [1024, 24, 1, 1024] - - [1109, 1392.02] + - [1245, 1392.02] - - [1024, 2944, 1, 1024] - - [1085, 9284.93] + - [1221, 9284.93] - - [36548, 14080, 1, 1024] - - [1076, 10441.4] + - [1212, 10441.4] - - [1024, 1, 1, 1024] - - [1096, 0.1] + - [1232, 0.1] - - [1024, 1280, 1, 1024] - - [1064, 8244.46] + - [1200, 8244.46] - - [1024, 13440, 1, 1024] - - [1076, 9799.92] + - [1212, 9799.92] - - [1024, 1015, 1, 1024] - - [1084, 9187.85] + - [1220, 9187.85] - - [36548, 9120, 1, 1024] - - [1076, 10400.0] + - [1212, 10400.0] - - [36548, 1, 1, 1024] - - [1096, 0.1] + - [1232, 0.1] - - [1024, 3008, 1, 1024] - - [1085, 9468.55] + - [1221, 9468.55] - - [1024, 2560, 1, 1024] - - [1082, 8879.31] + - [1218, 8879.31] - - [1024, 21, 1, 1024] - - [1108, 1234.41] + - [1244, 1234.41] - - [1024, 2208, 1, 1024] - - [1064, 8231.27] + - [1200, 8231.27] - - [1024, 96, 1, 1024] - - [1114, 3767.44] + - [1250, 3767.44] - - [4096, 86, 1, 2048] - - [1065, 5529.09] + - [1201, 5529.09] - - [4096, 96, 1, 3072] - - [1064, 6273.28] + - [1200, 6273.28] - - [1024, 1920, 1, 1024] - - [1094, 9118.19] + - [1230, 9118.19] - - [4096, 27, 1, 2048] - - [1097, 4073.7] + - [1233, 4073.7] - - [36548, 2496, 1, 1024] - - [1076, 10361.2] + - [1212, 10361.2] - - [1024, 1, 1, 14] - - [1096, 0.1] + - [1232, 0.1] - - [1024, 91, 1, 1024] - - [1116, 3647.67] + - [1252, 3647.67] - - [1024, 2016, 1, 1024] - - [1092, 9560.24] + - [1228, 9560.24] - - [1024, 1184, 1, 1024] - - [1065, 7678.96] + - [1201, 7678.96] - - [4096, 1, 1, 2048] - - [1096, 0.1] + - [1232, 0.1] - - [1024, 1664, 1, 1024] - - [1090, 7934.07] + - [1226, 7934.07] - - [1024, 11424, 1, 1024] - - [1082, 9777.91] + - [1218, 9777.91] - - [4096, 24, 1, 3072] - - [1100, 3813.1] + - [1236, 3813.1] - - [1024, 1216, 1, 1024] - - [1064, 7902.13] + - [1200, 7902.13] - - [36548, 3185, 1, 1024] - - [1076, 10336.7] + - [1212, 10336.7] - - [36548, 9216, 1, 1024] - - [1076, 10414.3] + - [1212, 10414.3] - - [1024, 3200, 1, 1024] - - [1082, 8847.01] + - [1218, 8847.01] - - [1024, 2656, 1, 1024] - - [1077, 8649.25] + - [1213, 8649.25] - - [1024, 2368, 1, 1024] - - [1077, 8873.16] + - [1213, 8873.16] - - [1024, 4459, 1, 1024] - - [1084, 9431.32] + - [1220, 9431.32] - - [1024, 3808, 1, 1024] - - [1084, 9263.72] + - [1220, 9263.72] - - [1024, 2336, 1, 1024] - - [1077, 8966.0] + - [1213, 8966.0] - - [4096, 27, 1, 3072] - - [1097, 4171.74] + - [1233, 4171.74] - - [1024, 2304, 1, 1024] - - [1074, 8601.38] + - [1210, 8601.38] - - [1024, 1560, 1, 1024] - - [1089, 7481.74] + - [1225, 7481.74] - - [4096, 35, 1, 3072] - - [1103, 4176.9] + - [1239, 4176.9] - - [1024, 2496, 1, 1024] - - [1080, 9092.86] + - [1216, 9092.86] - - [1024, 1504, 1, 1024] - - [1080, 9220.53] + - [1216, 9220.53] - - [4096, 50, 1, 2048] - - [1104, 5472.83] + - [1240, 5472.83] - - [1024, 3232, 1, 1024] - - [1077, 8961.94] + - [1213, 8961.94] - - [1024, 14, 1, 1024] - - [1108, 882.315] + - [1244, 882.315] - - [36548, 1015, 1, 1024] - - [1076, 10140.9] + - [1212, 10140.9] - - [1024, 2000, 1, 1024] - - [1088, 9487.8] + - [1224, 9487.8] - - [36548, 243, 1, 1024] - - [1081, 9441.12] + - [1217, 9441.12] - - [36548, 32, 1, 1024] - - [1069, 4721.05] + - [1205, 4721.05] - - [1024, 25, 1, 1024] - - [1115, 1462.96] + - [1251, 1462.96] - - [1024, 13184, 1, 1024] - - [1079, 9866.28] + - [1215, 9866.28] - - [1024, 2688, 1, 1024] - - [1074, 8559.93] + - [1210, 8559.93] - - [1024, 27, 1, 1024] - - [1113, 1559.11] + - [1249, 1559.11] - - [36548, 950, 1, 1024] - - [1083, 10053.6] + - [1219, 10053.6] - - [1024, 1764, 1, 1024] - - [1090, 8347.11] + - [1226, 8347.11] - - [1024, 992, 1, 1024] - - [1077, 9035.82] + - [1213, 9035.82] - - [1024, 1376, 1, 1024] - - [1077, 8797.96] + - [1213, 8797.96] - - [1024, 950, 1, 1024] - - [1084, 8635.26] + - [1220, 8635.26] - - [36548, 774, 1, 1024] - - [1076, 9460.82] + - [1212, 9460.82] - - [36548, 25, 1, 1024] - - [1069, 3694.16] + - [1205, 3694.16] - - [1024, 4256, 1, 1024] - - [1077, 9172.16] + - [1213, 9172.16] - - [4096, 32, 1, 3072] - - [1098, 4886.67] + - [1234, 4886.67] - - [1024, 243, 1, 1024] - - [1102, 6594.41] + - [1238, 6594.41] - - [36548, 3712, 1, 1024] - - [1076, 10401.6] + - [1212, 10401.6] - - [1024, 50, 1, 1024] - - [1111, 2742.19] + - [1247, 2742.19] - - [1024, 3360, 1, 1024] - - [1073, 9017.37] + - [1209, 9017.37] - - [1024, 2048, 1, 1024] - - [1088, 9736.65] + - [1224, 9736.65] - - [1024, 2784, 1, 1024] - - [1084, 8835.6] + - [1220, 8835.6] - - [1024, 4992, 1, 1024] - - [1082, 9639.38] + - [1218, 9639.38] - - [36548, 1102, 1, 1024] - - [1083, 9859.04] + - [1219, 9859.04] - - [1024, 1536, 1, 1024] - - [1075, 9294.98] + - [1211, 9294.98] - - [1024, 2720, 1, 1024] - - [1080, 8617.88] + - [1216, 8617.88] - - [4096, 1, 1, 3072] - - [1096, 0.1] + - [1232, 0.1] - - [1024, 2752, 1, 1024] - - [1084, 8902.17] + - [1220, 8902.17] - - [1024, 2816, 1, 1024] - - [1082, 8906.95] + - [1218, 8906.95] - - [1024, 2624, 1, 1024] - - [1084, 8494.41] + - [1220, 8494.41] - - [1024, 2144, 1, 1024] - - [1067, 8243.56] + - [1203, 8243.56] - - [36548, 1131, 1, 1024] - - [1083, 10104.6] + - [1219, 10104.6] - - [4096, 25, 1, 3072] - - [1098, 3959.98] + - [1234, 3959.98] - - [1024, 64, 1, 1024] - - [1111, 3410.1] + - [1247, 3410.1] - - [1024, 3296, 1, 1024] - - [1082, 9066.52] + - [1218, 9066.52] - - [36548, 4992, 1, 1024] - - [1076, 10395.6] + - [1212, 10395.6] - - [1024, 1344, 1, 1024] - - [1077, 8522.66] + - [1213, 8522.66] - - [36548, 2401, 1, 1024] - - [1076, 10250.3] + - [1212, 10250.3] - - [1024, 15744, 1, 1024] - - [1076, 10006.4] + - [1212, 10006.4] - - [1024, 15232, 1, 1024] - - [1075, 9912.21] + - [1211, 9912.21] - - [1024, 1888, 1, 1024] - - [1087, 8962.98] + - [1223, 8962.98] - - [1024, 1792, 1, 1024] - - [1091, 8556.82] + - [1227, 8556.82] - - [36548, 1073, 1, 1024] - - [1076, 10161.2] + - [1212, 10161.2] - - [4096, 50, 1, 3072] - - [1103, 5882.16] + - [1239, 5882.16] - - [36548, 15488, 1, 1024] - - [1083, 10437.1] + - [1219, 10437.1] - - [1024, 2464, 1, 1024] - - [1080, 8880.02] + - [1216, 8880.02] - - [1024, 2272, 1, 1024] - - [1077, 8720.35] + - [1213, 8720.35] - - [1024, 13, 1, 1024] - - [1107, 774.616] + - [1243, 774.616] - - [1024, 2432, 1, 1024] - - [1082, 8491.53] + - [1218, 8491.53] - - [36548, 24, 1, 1024] - - [1069, 3564.41] + - [1205, 3564.41] - - [1024, 3936, 1, 1024] - - [1092, 9433.3] + - [1228, 9433.3] - - [36548, 13824, 1, 1024] - - [1076, 10439.8] + - [1212, 10439.8] - - [1024, 2401, 1, 1024] - - [1084, 8870.03] + - [1220, 8870.03] - - [1024, 32, 1, 1024] - - [1099, 1839.71] + - [1235, 1839.71] - - [1024, 2176, 1, 1024] - - [1068, 8544.55] + - [1204, 8544.55] - - [1024, 2240, 1, 1024] - - [1077, 8381.55] + - [1213, 8381.55] - - [1024, 1728, 1, 1024] - - [1065, 8212.33] + - [1201, 8212.33] - - [1024, 128, 1, 1024] - - [1112, 4660.44] + - [1248, 4660.44] - - [1024, 216, 1, 1024] - - [1102, 5777.97] + - [1238, 5777.97] - - [1024, 63, 1, 1024] - - [1110, 3329.75] + - [1246, 3329.75] - - [1024, 86, 1, 1024] - - [1116, 3533.7] + - [1252, 3533.7] - - [1024, 2528, 1, 1024] - - [1072, 8789.25] + - [1208, 8789.25] - - [1024, 2400, 1, 1024] - - [1077, 8939.4] + - [1213, 8939.4] - - [1024, 1440, 1, 1024] - - [1084, 9131.41] + - [1220, 9131.41] - - [1024, 2912, 1, 1024] - - [1077, 9140.03] + - [1213, 9140.03] - - [4096, 35, 1, 2048] - - [1103, 4059.85] + - [1239, 4059.85] - - [4096, 63, 1, 2048] - - [1105, 6946.5] + - [1241, 6946.5] - - [1024, 2880, 1, 1024] - - [1075, 9104.98] + - [1211, 9104.98] - - [1024, 4064, 1, 1024] - - [1094, 9715.2] + - [1230, 9715.2] - - [1024, 4655, 1, 1024] - - [1082, 9033.9] + - [1218, 9033.9] - - [1024, 1088, 1, 1024] - - [1066, 8144.41] + - [1202, 8144.41] - - [36548, 6272, 1, 1024] - - [1083, 10427.4] + - [1219, 10427.4] - - [1024, 1, 1, 13] - - [1096, 0.1] + - [1232, 0.1] + - - [768, 512, 1, 768] + - [1256, 5889.14] + - - [768, 2048, 1, 3072] + - [1266, 9394.72] + - - [768, 32, 1, 768] + - [1278, 1502.84] + - - [64, 128, 96, 128] + - [1273, 4973.58] + - - [3072, 1024, 1, 768] + - [1267, 9856.17] + - - [768, 1024, 1, 3072] + - [1260, 8611.16] + - - [768, 512, 1, 3072] + - [1259, 6430.89] + - - [768, 64, 1, 768] + - [1280, 2621.54] + - - [768, 4096, 1, 3072] + - [1265, 10030.5] + - - [768, 2048, 1, 2] + - [1258, 381.863] + - - [768, 2048, 1, 768] + - [1263, 9754.3] + - - [768, 320, 1, 30522] + - [1276, 8529.5] + - - [64, 64, 96, 64] + - [1270, 2496.71] + - - [768, 640, 1, 30522] + - [1257, 8253.94] + - - [768, 1280, 1, 30522] + - [1262, 9572.95] + - - [768, 1280, 1, 768] + - [1266, 8714.03] + - - [768, 640, 1, 768] + - [1256, 7293.13] + - - [768, 32, 1, 2] + - [1268, 11.9154] + - - [3072, 2048, 1, 768] + - [1263, 10019.7] + - - [768, 4096, 1, 768] + - [1263, 9927.45] + - - [3072, 4096, 1, 768] + - [1266, 10150.2] + - - [64, 256, 192, 256] + - [1272, 7054.29] + - - [768, 8, 1, 768] + - [1279, 341.039] + - - [64, 128, 384, 128] + - [1271, 6765.11] + - - [768, 1024, 1, 768] + - [1261, 8768.68] + - - [768, 320, 1, 768] + - [1277, 6838.64] + - - [64, 64, 768, 64] + - [1274, 5388.93] + - - [768, 1024, 1, 2] + - [1254, 258.795] + - - [768, 16, 1, 768] + - [1279, 819.3] + - - [64, 256, 96, 256] + - [1272, 5893.74] + - - [3072, 512, 1, 768] + - [1264, 9722.89] + - - [768, 160, 1, 768] + - [1281, 5019.88] + - - [768, 4096, 1, 2] + - [1255, 507.475] + - - [1600, 512, 1, 1024] + - [1285, 7187.05] + - - [1024, 512, 1, 64] + - [1283, 2557.6] + - - [1024, 512, 1, 1] + - [1282, 71.3348] + - - [2048, 512, 1, 1] + - [1284, 90.4945] + - - [1024, 200, 1, 1] + - [1290, 40.1] + - - [32, 200, 1, 1] + - [1286, 1.66863] + - - [560, 200, 1, 1024] + - [1294, 4731.45] + - - [1, 512, 1, 1] + - [1293, 0.230612] + - - [64, 512, 1, 1] + - [1288, 7.68519] + - - [1024, 8192, 1, 256] + - [1303, 9519.09] + - - [1024, 22016, 1, 256] + - [1309, 9881.22] + - - [256, 8976, 1, 4352] + - [1301, 9567.18] + - - [512, 256, 1, 2048] + - [1314, 5917.99] + - - [1024, 19968, 1, 256] + - [1309, 9882.47] + - - [256, 8976, 1, 1536] + - [1299, 8437.45] + - - [256, 8976, 1, 33536] + - [1299, 8441.99] + - - [1024, 1792, 1, 256] + - [1299, 7757.07] + - - [1024, 21504, 1, 256] + - [1309, 9894.0] + - - [512, 215, 1, 2048] + - [1315, 4665.74] + - - [1024, 7168, 1, 256] + - [1303, 9509.45] + - - [256, 8976, 1, 15872] + - [1305, 8914.75] + - - [1024, 19712, 1, 256] + - [1309, 9772.0] + - - [256, 8976, 1, 5632] + - [1305, 8740.13] + - - [1024, 14848, 1, 256] + - [1309, 9756.25] + - - [1024, 28672, 1, 256] + - [1309, 9959.02] + - - [256, 8976, 1, 9728] + - [1312, 8853.14] + - - [1024, 17152, 1, 256] + - [1303, 9737.4] + - - [256, 8976, 1, 11520] + - [1305, 8999.3] + - - [256, 8976, 1, 8192] + - [1295, 7897.42] + - - [1024, 3328, 1, 256] + - [1310, 8593.63] + - - [256, 8976, 1, 7424] + - [1305, 8980.57] + - - [1024, 18944, 1, 256] + - [1309, 9854.95] + - - [1024, 10496, 1, 256] + - [1304, 9454.0] + - - [256, 8976, 1, 5376] + - [1302, 9608.47] + - - [256, 8976, 1, 6144] + - [1299, 7880.23] + - - [1024, 40448, 1, 256] + - [1309, 10016.7] + - - [256, 8976, 1, 22016] + - [1312, 8939.97] + - - [256, 8976, 1, 4864] + - [1300, 9211.53] + - - [256, 8976, 1, 12288] + - [1296, 8065.15] + - - [1024, 9728, 1, 256] + - [1309, 9636.35] + - - [256, 8976, 1, 2048] + - [1297, 7001.43] + - - [1024, 10240, 1, 256] + - [1303, 9620.06] + - - [256, 8976, 1, 2304] + - [1301, 9509.84] + - - [1024, 7936, 1, 256] + - [1309, 9300.77] + - - [768, 256, 1, 2048] + - [1313, 6268.05] + - - [1024, 9984, 1, 256] + - [1309, 9477.38] + - - [1024, 13312, 1, 256] + - [1309, 9758.66] + - - [1024, 16128, 1, 256] + - [1303, 9722.0] + - - [1024, 8960, 1, 256] + - [1304, 9398.35] + - - [1024, 5120, 1, 256] + - [1310, 9315.6] + - - [1024, 11264, 1, 256] + - [1303, 9664.9] + - - [256, 8976, 1, 20480] + - [1311, 8279.97] + - - [1024, 20992, 1, 256] + - [1303, 9878.97] + - - [256, 8976, 1, 9472] + - [1305, 8991.06] + - - [256, 8976, 1, 8448] + - [1305, 8983.62] + - - [256, 8976, 1, 20992] + - [1306, 8942.21] + - - [256, 8976, 1, 10496] + - [1306, 8989.81] + - - [1024, 15104, 1, 256] + - [1304, 9676.11] + - - [1024, 6400, 1, 256] + - [1312, 9145.99] + - - [1024, 4096, 1, 256] + - [1305, 9124.35] + - - [256, 8976, 1, 2560] + - [1299, 8566.21] + - - [256, 8976, 1, 2816] + - [1301, 9496.94] + - - [1024, 7680, 1, 256] + - [1309, 9460.94] + - - [256, 8976, 1, 14336] + - [1306, 8226.9] + - - [256, 8976, 1, 6656] + - [1306, 8771.52] + - - [1024, 3072, 1, 256] + - [1306, 9077.04] + - - [256, 8976, 1, 5888] + - [1302, 9546.4] + - - [1024, 12288, 1, 256] + - [1303, 9690.91] + - - [256, 8976, 1, 26112] + - [1308, 8699.93] + - - [1024, 7424, 1, 256] + - [1310, 9256.94] + - - [256, 8976, 1, 14848] + - [1311, 8885.89] + - - [768, 215, 1, 2048] + - [1313, 5628.69] + - - [1024, 2560, 1, 256] + - [1306, 8820.93] + - - [256, 8976, 1, 19968] + - [1305, 8928.96] + - - [256, 8976, 1, 9984] + - [1305, 8993.22] + - - [1024, 4864, 1, 256] + - [1306, 8974.4] + - - [1024, 33536, 1, 256] + - [1309, 9943.17] + - - [256, 8976, 1, 15104] + - [1306, 8996.73] + - - [1024, 2048, 1, 256] + - [1304, 8462.76] + - - [256, 8976, 1, 8960] + - [1306, 8999.02] + - - [1024, 6144, 1, 256] + - [1311, 9359.77] + - - [1024, 14592, 1, 256] + - [1309, 9667.52] + - - [256, 8976, 1, 19712] + - [1305, 9020.21] + - - [1024, 11520, 1, 256] + - [1304, 9527.8] + - - [1024, 5632, 1, 256] + - [1303, 9297.3] + - - [256, 8976, 1, 11008] + - [1312, 8994.9] + - - [256, 8976, 1, 17152] + - [1306, 9003.9] + - - [256, 8976, 1, 3072] + - [1295, 8262.06] + - - [1024, 3840, 1, 256] + - [1312, 8671.99] + - - [1024, 14336, 1, 256] + - [1309, 9760.38] + - - [1024, 20480, 1, 256] + - [1303, 9887.95] + - - [1024, 23552, 1, 256] + - [1303, 9890.56] + - - [256, 8976, 1, 7168] + - [1298, 8478.44] + - - [1024, 13568, 1, 256] + - [1303, 9654.74] + - - [1024, 4608, 1, 256] + - [1311, 9218.35] + - - [256, 8976, 1, 10240] + - [1296, 8076.26] + - - [1024, 8704, 1, 256] + - [1305, 9475.6] + - - [1024, 11008, 1, 256] + - [1309, 9525.06] + - - [1024, 8448, 1, 256] + - [1303, 9352.26] + - - [256, 8976, 1, 44505] + - [1307, 8430.33] + - - [6272, 256, 1, 528] + - [1359, 7390.04] + - - [3136, 2048, 1, 1024] + - [1340, 9658.04] + - - [6272, 112, 1, 512] + - [1338, 5931.19] + - - [2048, 320, 1, 1280] + - [1358, 7773.09] + - - [289, 256, 1, 1568] + - [1379, 3718.27] + - - [3136, 64, 64, 64] + - [1318, 8201.25] + - - [50176, 128, 1, 256] + - [1341, 8908.68] + - - [5329, 64, 1, 448] + - [1324, 4602.3] + - - [289, 192, 1, 1344] + - [1376, 3452.69] + - - [12544, 1024, 1, 256] + - [1341, 9742.74] + - - [784, 64, 32, 192] + - [1317, 6844.71] + - - [6272, 64, 1, 480] + - [1325, 5562.34] + - - [196, 128, 1, 800] + - [1367, 1639.84] + - - [64, 512, 1, 1344] + - [1366, 2313.14] + - - [6272, 64, 1, 512] + - [1324, 5609.29] + - - [6272, 160, 1, 528] + - [1325, 6149.8] + - - [289, 160, 32, 768] + - [1352, 6637.92] + - - [12544, 256, 1, 1024] + - [1359, 8790.56] + - - [289, 224, 1, 1568] + - [1379, 3270.27] + - - [5329, 64, 32, 160] + - [1332, 9091.14] + - - [5329, 96, 1, 576] + - [1359, 5555.76] + - - [3025, 64, 1, 363] + - [1377, 4392.4] + - - [784, 32, 32, 192] + - [1348, 5633.9] + - - [3136, 512, 1, 1024] + - [1344, 7553.24] + - - [6272, 16, 1, 480] + - [1379, 3219.95] + - - [1225, 64, 32, 288] + - [1339, 8240.68] + - - [64, 256, 1, 1536] + - [1372, 1456.46] + - - [289, 192, 32, 768] + - [1351, 7372.9] + - - [2048, 448, 1, 1280] + - [1334, 8403.11] + - - [3136, 2048, 1, 512] + - [1333, 9486.41] + - - [289, 256, 1, 2016] + - [1379, 3876.18] + - - [289, 384, 32, 1024] + - [1318, 7350.64] + - - [1568, 32, 1, 832] + - [1368, 2717.97] + - - [3136, 64, 32, 64] + - [1321, 7657.36] + - - [289, 160, 1, 1120] + - [1375, 2827.0] + - - [6272, 128, 1, 528] + - [1329, 6926.36] + - - [21609, 32, 1, 288] + - [1330, 3699.0] + - - [1225, 192, 1, 1728] + - [1363, 7309.91] + - - [4096, 512, 1, 4096] + - [1346, 10272.2] + - - [64, 256, 1, 1152] + - [1372, 1387.92] + - - [6272, 96, 1, 480] + - [1360, 6371.66] + - - [784, 96, 1, 800] + - [1380, 3330.37] + - - [2048, 448, 1, 2048] + - [1334, 8622.75] + - - [784, 96, 32, 192] + - [1349, 7092.46] + - - [3136, 64, 64, 256] + - [1342, 9579.26] + - - [289, 224, 1, 1344] + - [1379, 3180.11] + - - [1001, 512, 1, 4096] + - [1320, 8195.17] + - - [2048, 192, 1, 1280] + - [1325, 6120.19] + - - [1225, 64, 32, 256] + - [1330, 8076.72] + - - [2048, 256, 1, 1536] + - [1320, 8137.8] + - - [1225, 64, 1, 1200] + - [1379, 3552.97] + - - [6272, 128, 1, 512] + - [1333, 6878.31] + - - [729, 192, 1, 1600] + - [1378, 5016.87] + - - [289, 192, 1, 896] + - [1376, 3091.97] + - - [1568, 384, 1, 832] + - [1359, 6934.72] + - - [784, 16, 32, 192] + - [1350, 3380.38] + - - [1568, 256, 1, 832] + - [1324, 5980.96] + - - [1568, 48, 1, 832] + - [1381, 3275.19] + - - [1568, 192, 1, 832] + - [1319, 4441.21] + - - [289, 192, 32, 1024] + - [1322, 6563.16] + - - [6272, 32, 1, 528] + - [1363, 4998.77] + - - [49, 128, 1, 1200] + - [1364, 550.275] + - - [1225, 64, 32, 384] + - [1336, 8589.43] + - - [289, 128, 1, 896] + - [1375, 2103.2] + - - [1568, 160, 1, 832] + - [1363, 6995.15] + - - [1001, 32, 1, 1024] + - [1372, 1744.82] + - - [2048, 320, 1, 2048] + - [1357, 7118.14] + - - [2048, 384, 1, 1536] + - [1320, 8184.11] + - - [50176, 512, 1, 256] + - [1332, 9852.5] + - - [289, 256, 1, 1792] + - [1381, 3809.85] + - - [64, 448, 1, 1152] + - [1373, 2128.33] + - - [5041, 96, 1, 576] + - [1358, 5279.4] + - - [6272, 192, 1, 480] + - [1320, 7479.75] + - - [784, 32, 32, 256] + - [1347, 5709.01] + - - [1001, 32, 1, 2048] + - [1374, 2141.14] + - - [289, 192, 1, 1120] + - [1370, 3277.87] + - - [6272, 32, 1, 512] + - [1362, 4978.8] + - - [289, 384, 1, 3456] + - [1379, 5904.24] + - - [289, 384, 1, 2592] + - [1380, 5707.44] + - - [784, 128, 64, 512] + - [1326, 8864.49] + - - [12544, 1024, 1, 512] + - [1341, 10008.4] + - - [12544, 256, 1, 512] + - [1359, 8628.18] + - - [6272, 24, 1, 512] + - [1363, 3568.17] + - - [5041, 192, 1, 720] + - [1334, 8424.52] + - - [64, 320, 1, 1728] + - [1367, 1469.76] + - - [784, 128, 32, 256] + - [1335, 8104.24] + - - [289, 96, 1, 864] + - [1373, 1838.35] + - - [1225, 32, 32, 192] + - [1354, 5949.82] + - - [1568, 128, 1, 832] + - [1362, 5718.79] + - - [289, 128, 32, 768] + - [1320, 7289.35] + - - [3136, 256, 64, 64] + - [1328, 9104.02] + - - [196, 64, 1, 800] + - [1366, 915.72] + - - [4096, 512, 1, 9216] + - [1343, 10351.5] + - - [12544, 64, 1, 147] + - [1333, 5069.43] + - - [784, 32, 1, 400] + - [1364, 1140.46] + - - [6272, 160, 1, 512] + - [1324, 6140.18] + - - [1225, 48, 32, 288] + - [1330, 5978.71] + - - [64, 320, 1, 2880] + - [1371, 1920.1] + - - [1225, 64, 32, 192] + - [1324, 7641.11] + - - [1001, 32, 1, 1536] + - [1372, 2084.89] + - - [784, 64, 32, 256] + - [1316, 6990.61] + - - [64, 384, 1, 1152] + - [1373, 1862.7] + - - [784, 512, 64, 128] + - [1327, 9026.05] + - - [3136, 512, 1, 2048] + - [1345, 7764.4] + - - [6272, 144, 1, 512] + - [1320, 5574.14] + - - [1225, 192, 32, 384] + - [1334, 9373.93] + - - [64, 192, 1, 1728] + - [1372, 1206.56] + - - [8192, 320, 1, 1280] + - [1386, 9876.02] + - - [8192, 320, 1, 2048] + - [1389, 9745.8] + - - [8192, 384, 1, 1280] + - [1386, 10046.3] + - - [8192, 192, 1, 1280] + - [1389, 9951.0] + - - [8192, 192, 1, 2048] + - [1385, 9559.77] + - - [8192, 384, 1, 2048] + - [1387, 9945.84] + - - [8192, 448, 1, 2048] + - [1388, 9908.61] + - - [1001, 64, 1, 1536] + - [1382, 3650.04] + - - [8192, 448, 1, 1280] + - [1386, 9981.45] + - - [1001, 64, 1, 2048] + - [1383, 3580.97] + - - [1001, 128, 1, 2048] + - [1384, 5587.97] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_SB.yaml index ea1cea1b2..cbbb723c2 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_SB.yaml @@ -39633,8 +39633,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39797,8 +39797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39961,8 +39961,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40125,8 +40125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40289,8 +40289,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40453,8 +40453,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40617,8 +40617,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40781,8 +40781,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40945,8 +40945,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41109,8 +41109,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41273,8 +41273,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41437,8 +41437,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41601,8 +41601,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41765,8 +41765,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41925,8 +41925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42089,8 +42089,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42253,8 +42253,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42417,8 +42417,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42581,8 +42581,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42745,8 +42745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42909,8 +42909,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43073,8 +43073,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43237,8 +43237,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43401,8 +43401,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43566,8 +43566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43733,8 +43733,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43898,8 +43898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44061,8 +44061,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44226,8 +44226,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44393,8 +44393,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44558,8 +44558,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44721,8 +44721,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44886,8 +44886,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45053,8 +45053,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45218,8 +45218,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45381,8 +45381,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45546,8 +45546,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45713,8 +45713,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45878,8 +45878,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46041,8 +46041,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46206,8 +46206,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46371,8 +46371,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46538,8 +46538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46703,8 +46703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46868,8 +46868,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47033,8 +47033,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47198,8 +47198,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47361,8 +47361,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47526,8 +47526,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47693,8 +47693,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47858,8 +47858,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48021,8 +48021,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48186,8 +48186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48353,8 +48353,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48518,8 +48518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48681,8 +48681,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48848,8 +48848,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49011,8 +49011,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49178,8 +49178,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49341,8 +49341,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49502,8 +49502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49665,8 +49665,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49826,8 +49826,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49987,8 +49987,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50146,8 +50146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50309,8 +50309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50468,8 +50468,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50631,8 +50631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50790,8 +50790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50953,8 +50953,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51112,8 +51112,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51434,8 +51434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51597,8 +51597,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51758,8 +51758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51917,8 +51917,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52080,8 +52080,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52239,8 +52239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52400,8 +52400,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52561,8 +52561,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52728,8 +52728,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52897,8 +52897,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53064,8 +53064,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53229,8 +53229,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53396,8 +53396,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53445,24 +53445,24 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -53470,32 +53470,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53504,9 +53509,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53514,26 +53519,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53543,6 +53556,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53552,6 +53566,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53566,39 +53581,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 341 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53606,56 +53629,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53663,19 +53687,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -53683,6 +53714,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53692,6 +53724,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53701,6 +53734,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53715,39 +53749,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 342 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 2, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53764,32 +53806,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 5120 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetB: 4096 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53798,9 +53841,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53808,26 +53851,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53837,6 +53888,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53846,6 +53898,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53860,45 +53913,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 343 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -53909,36 +53970,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53947,9 +54005,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53957,26 +54015,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53986,6 +54052,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53995,6 +54062,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54009,33 +54077,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 344 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54048,40 +54124,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -54095,10 +54172,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54106,19 +54183,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54126,6 +54208,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54135,6 +54218,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54144,6 +54228,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54158,33 +54243,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 345 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54196,58 +54291,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54255,26 +54347,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54284,6 +54382,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54293,6 +54392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54307,33 +54407,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 346 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id003 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54346,8 +54456,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -54355,31 +54465,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 24 - LSPB: 24 - LVCA: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 4608 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -54393,10 +54504,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54404,19 +54515,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54424,6 +54540,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54433,6 +54550,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54442,6 +54560,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54456,39 +54575,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 347 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54505,47 +54634,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54553,19 +54683,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54573,6 +54710,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54582,6 +54720,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54591,6 +54730,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54605,85 +54745,94 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 348 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 - SubGroup0: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 8 - LVCA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54691,10 +54840,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54702,19 +54851,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54722,6 +54878,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54731,6 +54888,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54740,6 +54898,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54754,48 +54913,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 349 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -54806,33 +54973,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54840,10 +55008,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54851,19 +55019,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54871,6 +55046,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54880,6 +55056,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54889,6 +55066,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54903,85 +55081,94 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 350 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54990,9 +55177,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55000,19 +55187,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55020,6 +55214,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55029,6 +55224,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55038,6 +55234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55052,96 +55249,105 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 351 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55149,19 +55355,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55169,6 +55380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55178,6 +55390,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55187,6 +55400,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55201,46 +55415,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 352 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -55249,44 +55473,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 16 - LVCA: 4 - LVCB: 8 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -55298,19 +55523,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55318,6 +55550,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55327,6 +55560,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55336,6 +55570,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55350,35 +55585,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 353 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55388,9 +55631,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -55398,31 +55641,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55436,10 +55680,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55447,19 +55691,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55467,6 +55716,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55476,6 +55726,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55485,6 +55736,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55499,35 +55751,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 354 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55537,9 +55799,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -55552,26 +55814,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 + LSPA: 64 + LSPB: 128 + LVCA: 4 LVCB: 2 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55585,10 +55848,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55596,26 +55859,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55625,6 +55896,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55634,6 +55906,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55648,35 +55921,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 355 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55686,41 +55967,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 16 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55734,10 +56016,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55745,19 +56027,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55765,6 +56054,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55774,6 +56064,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55783,6 +56074,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55797,75 +56089,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 356 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3200 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -55882,11 +56183,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55894,26 +56195,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55923,6 +56232,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55932,6 +56242,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55946,35 +56257,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 357 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55984,41 +56303,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56032,10 +56352,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56043,19 +56363,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56063,6 +56390,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56072,6 +56400,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56081,6 +56410,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56095,79 +56425,88 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 358 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56180,11 +56519,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56192,19 +56531,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56212,6 +56556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56221,6 +56566,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56230,6 +56576,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56244,35 +56591,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 359 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56282,41 +56639,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56330,10 +56688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56341,19 +56699,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56361,6 +56726,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56370,6 +56736,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56379,6 +56746,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56393,48 +56761,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 360 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -56446,26 +56822,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56478,11 +56855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56490,19 +56867,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56510,6 +56892,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56519,6 +56902,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56528,6 +56912,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56542,35 +56927,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 361 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56580,58 +56975,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56639,19 +57035,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56659,6 +57060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56668,6 +57070,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56677,6 +57080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56691,35 +57095,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 362 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56729,8 +57143,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -56744,22 +57158,23 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3328 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56778,29 +57193,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56808,6 +57232,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56817,6 +57242,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56826,8 +57252,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -56840,35 +57268,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 363 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56878,37 +57314,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3584 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56927,29 +57364,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56957,6 +57403,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56966,6 +57413,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56975,8 +57423,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -56989,35 +57439,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 364 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57027,37 +57485,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3200 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -57076,29 +57535,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57106,6 +57574,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57115,6 +57584,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57124,8 +57594,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57138,35 +57610,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 365 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57176,41 +57656,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57224,30 +57705,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57255,6 +57745,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57264,6 +57755,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57273,8 +57765,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57287,35 +57781,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 366 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57325,8 +57827,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -57339,27 +57841,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57373,30 +57876,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57404,6 +57914,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57413,6 +57924,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57422,8 +57934,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57436,35 +57950,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 367 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57474,8 +57998,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57488,23 +58012,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3584 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -57523,29 +58048,36 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57553,6 +58085,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57562,6 +58095,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57571,8 +58105,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57585,48 +58121,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 368 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -57634,67 +58180,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 - LdcEqualsLdd: false + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57702,6 +58256,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57711,6 +58266,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57720,8 +58276,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57734,14 +58292,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 369 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -57752,105 +58317,113 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57860,6 +58433,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57869,8 +58443,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57883,39 +58459,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 370 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57923,76 +58509,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 - LVPB: 32 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58000,6 +58596,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58009,6 +58606,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58018,8 +58616,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58032,33 +58632,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 371 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58081,67 +58689,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58149,6 +58767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58158,6 +58777,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58167,8 +58787,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58181,39 +58803,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 372 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -58221,7 +58851,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -58230,36 +58860,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -58267,37 +58898,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58307,6 +58948,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58316,8 +58958,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58330,33 +58974,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 373 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58379,74 +59031,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58456,6 +59119,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58465,8 +59129,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58479,33 +59145,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 374 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58519,76 +59193,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58596,6 +59280,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58605,6 +59290,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58614,8 +59300,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58628,33 +59316,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 375 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58668,76 +59364,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58745,6 +59451,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58754,6 +59461,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58763,8 +59471,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58777,48 +59487,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 376 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -58826,36 +59544,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -58863,30 +59582,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58894,6 +59620,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58903,6 +59630,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58912,8 +59640,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58926,33 +59656,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 377 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58965,77 +59705,85 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3328 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59043,6 +59791,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59052,6 +59801,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59061,8 +59811,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59075,47 +59827,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 378 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -59124,36 +59886,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -59161,37 +59924,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59201,6 +59972,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59210,8 +59982,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59224,33 +59998,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 379 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 1 - WorkGroupMappingType: B + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59263,77 +60047,85 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59341,6 +60133,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59350,6 +60143,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59359,8 +60153,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59373,33 +60169,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 380 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59412,7 +60218,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -59422,67 +60228,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59490,6 +60304,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59499,6 +60314,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59508,8 +60324,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59522,39 +60340,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 381 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59562,33 +60390,34 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -59600,7 +60429,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -59608,37 +60437,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59648,6 +60487,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59657,8 +60497,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59671,39 +60513,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 382 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59711,8 +60561,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -59720,74 +60570,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59797,6 +60658,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59806,8 +60668,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59820,39 +60684,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 383 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59860,65 +60732,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -59926,17 +60801,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59946,6 +60829,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59955,8 +60839,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59969,39 +60855,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 384 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60009,45 +60903,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60055,19 +60950,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -60075,17 +60972,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60095,6 +61000,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60104,8 +61010,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60118,33 +61026,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 385 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -60167,13 +61083,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -60183,8 +61100,8 @@ LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -60196,14 +61113,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -60211,7 +61128,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -60224,10 +61143,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60235,6 +61161,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60244,6 +61171,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60253,8 +61181,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60267,75 +61197,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 386 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 16 + LSPB: 64 LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -60345,45 +61284,53 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60393,6 +61340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60402,8 +61350,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60416,85 +61366,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 387 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id020 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60502,37 +61463,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60542,6 +61511,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60551,8 +61521,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60565,73 +61537,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 388 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -60643,7 +61626,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60651,30 +61634,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60682,6 +61672,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60691,6 +61682,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60700,8 +61692,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60714,123 +61708,142 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 389 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 - LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60840,6 +61853,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60849,8 +61863,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60863,123 +61879,142 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 390 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60989,6 +62024,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60998,8 +62034,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61012,85 +62050,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 391 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61098,37 +62147,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61138,6 +62195,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -61147,8 +62205,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61161,47 +62221,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 392 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -61209,37 +62279,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61247,30 +62318,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61278,6 +62356,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61287,6 +62366,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -61296,8 +62376,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61310,33 +62392,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 393 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61350,83 +62442,94 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61436,6 +62539,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -61445,8 +62549,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61459,33 +62565,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 394 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61499,83 +62613,94 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61585,6 +62710,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -61594,8 +62720,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61608,33 +62736,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 395 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61648,8 +62784,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -61657,74 +62793,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61734,6 +62881,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -61743,8 +62891,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61757,33 +62907,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 396 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61798,7 +62956,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -61806,67 +62964,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -61874,6 +63042,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -61883,6 +63052,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -61892,8 +63062,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -61906,33 +63078,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 397 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61947,7 +63127,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -61955,67 +63135,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62023,6 +63213,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62032,6 +63223,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62041,8 +63233,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62055,33 +63249,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 398 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62104,13 +63306,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -62120,10 +63323,10 @@ LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 + LdcEqualsLdd: true + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -62133,38 +63336,47 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62172,6 +63384,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62181,6 +63394,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62190,8 +63404,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62204,33 +63420,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 399 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62244,76 +63468,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62321,6 +63555,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62330,6 +63565,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62339,8 +63575,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62353,33 +63591,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 400 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62392,9 +63638,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -62402,74 +63648,83 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62479,6 +63734,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62488,8 +63744,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62502,33 +63760,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 401 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62541,9 +63809,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -62551,67 +63819,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62619,6 +63895,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62628,6 +63905,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62637,8 +63915,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62651,123 +63931,142 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 402 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false + LdcEqualsLdd: true LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62777,6 +64076,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62786,8 +64086,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62800,116 +64102,134 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 403 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false + LdcEqualsLdd: true LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -62917,6 +64237,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -62926,6 +64247,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -62935,8 +64257,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -62949,116 +64273,134 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 404 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false + LdcEqualsLdd: true LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63066,6 +64408,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63075,6 +64418,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63084,8 +64428,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63098,46 +64444,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 405 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63147,74 +64503,83 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false + LdcEqualsLdd: true LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63224,6 +64589,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63233,8 +64599,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63247,46 +64615,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 406 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63296,36 +64674,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63333,37 +64712,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63373,6 +64760,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63382,8 +64770,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63396,46 +64786,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 407 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63445,67 +64845,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63513,6 +64921,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63522,6 +64931,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63531,8 +64941,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63545,46 +64957,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 408 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63594,26 +65016,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -63623,38 +65046,45 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63662,6 +65092,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63671,6 +65102,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63680,8 +65112,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63694,39 +65128,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 409 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -63734,7 +65178,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -63743,30 +65187,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -63779,31 +65224,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63811,6 +65265,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63820,6 +65275,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63829,8 +65285,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63843,39 +65301,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 410 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -63883,7 +65349,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -63892,36 +65358,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63930,18 +65397,20 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -63949,10 +65418,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -63960,6 +65436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -63969,6 +65446,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -63978,8 +65456,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -63992,39 +65472,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 411 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -64033,7 +65521,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -64041,36 +65529,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -64085,23 +65574,32 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64109,6 +65607,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64118,6 +65617,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -64127,8 +65627,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -64141,39 +65643,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 412 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -64181,45 +65691,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -64234,30 +65745,40 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64267,6 +65788,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -64276,53 +65798,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 413 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -64330,76 +65863,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64407,6 +65950,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64416,6 +65960,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -64425,53 +65970,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 414 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -64480,7 +66036,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -64488,36 +66044,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -64525,30 +66082,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64556,6 +66122,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64565,6 +66132,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -64574,53 +66142,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 415 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -64628,8 +66207,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -64637,43 +66216,44 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -64681,23 +66261,32 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64705,6 +66294,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64714,6 +66304,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -64723,53 +66314,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 416 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -64777,8 +66379,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -64786,36 +66388,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -64823,19 +66426,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -64843,10 +66448,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -64854,6 +66466,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -64863,6 +66476,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -64872,59 +66486,70 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 417 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -64935,56 +66560,55 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -64992,17 +66616,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65012,6 +66644,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65021,53 +66654,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 418 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65084,67 +66728,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65152,6 +66806,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65161,6 +66816,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65170,53 +66826,64 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 419 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -65233,36 +66900,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65271,29 +66939,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65301,6 +66978,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65310,6 +66988,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65319,93 +66998,105 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 420 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -65418,7 +67109,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -65426,30 +67117,38 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65459,6 +67158,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65468,93 +67168,107 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 421 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -65567,31 +67281,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -65599,6 +67320,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65608,6 +67330,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65617,60 +67340,73 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 422 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -65683,27 +67419,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -65716,38 +67449,46 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65757,6 +67498,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65766,49 +67508,62 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 423 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65818,8 +67573,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65833,10 +67588,11 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -65845,14 +67601,14 @@ LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -65866,37 +67622,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -65906,6 +67670,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -65915,62 +67680,75 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 424 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -65982,26 +67760,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66014,7 +67793,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66022,23 +67801,30 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66046,6 +67832,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66055,6 +67842,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66064,62 +67852,75 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 425 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66131,26 +67932,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66163,31 +67965,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -66195,6 +68004,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -66204,6 +68014,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66213,62 +68024,56954 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 426 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 427 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 428 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 429 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 430 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 431 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 432 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 433 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 434 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 435 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 436 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 437 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1536 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 438 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 439 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 440 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 441 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 442 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 443 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 444 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 445 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 446 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 447 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 448 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 449 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 450 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 451 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 453 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 454 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 455 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 456 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 457 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 458 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 459 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 460 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 461 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 462 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 463 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 464 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 465 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 466 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 467 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 468 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 469 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 470 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 471 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 472 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 473 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 474 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 475 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 476 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 477 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 478 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 479 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 480 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 481 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 482 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 483 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 484 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 485 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 486 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 487 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 488 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 489 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 490 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 491 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 492 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 493 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 494 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 495 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 496 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 497 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 498 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 499 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 500 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 501 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 502 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 503 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 504 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 505 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 506 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3200 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 519 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 520 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 2, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 5120 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 521 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 522 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 523 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 524 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 24 + LSPB: 24 + LVCA: 8 + LVCB: 8 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 525 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 526 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 527 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 528 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 529 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 530 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 531 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 532 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 533 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 534 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 535 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 536 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 537 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 538 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 539 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 540 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 541 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 542 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 543 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 544 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 545 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 546 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 547 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 548 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 549 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 550 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 551 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 552 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 553 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 554 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 555 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 556 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 557 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 558 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 559 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 560 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 561 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 562 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 563 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 564 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 565 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 566 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 567 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 568 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 569 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 570 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 571 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 572 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 573 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 574 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 575 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 576 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 577 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 578 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 579 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 580 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 581 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 582 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 583 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 584 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 585 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 586 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 587 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 588 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 589 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 593 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 594 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 595 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 596 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 597 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 598 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 599 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 600 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 601 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 602 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 603 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 604 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 605 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 606 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 607 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 608 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 609 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 610 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 611 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 612 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 613 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 614 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 615 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 616 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 617 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 618 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 619 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 620 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 621 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 622 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 623 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 624 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 625 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 626 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 627 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 628 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 629 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 4 + LSCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 630 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 2 + LSCB: 2 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 631 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 632 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 633 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 634 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 635 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 636 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 637 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 638 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id038 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 639 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 640 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 641 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 642 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id037 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 643 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id037 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 644 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id038 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 645 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 646 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 647 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 662 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 664 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3136 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 665 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 668 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1600 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 670 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 671 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 672 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 674 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3136 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2624 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3600 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 692 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3136 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 695 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 13376 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4160 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 697 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2624 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 699 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 12864 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 4160 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 512 + LdsOffsetB_Blk: 8704 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 703 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3408 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2144 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 718 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 719 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 720 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 726 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 728 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 729 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 731 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 4160 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 9280 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14464 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 4160 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 736 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 737 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3424 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 738 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 741 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 742 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 743 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 744 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4736 + LdsOffsetA: 0 + LdsOffsetB: 4160 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 746 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 747 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 748 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2688 + LdsOffsetA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 755 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 757 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 759 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 760 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 761 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 762 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 764 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 765 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 766 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 768 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 773 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 777 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 778 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 779 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 780 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 781 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 16 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66279,44 +124982,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66324,15 +125027,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66353,6 +125059,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66362,6 +125069,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -66376,35 +125084,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -66414,8 +125132,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66425,47 +125143,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66473,15 +125191,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66502,6 +125223,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66511,6 +125233,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -66525,48 +125248,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 784 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66578,43 +125311,43 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPB: 8 + LVCA: 2 + LVCB: 4 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66622,15 +125355,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66651,6 +125387,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66660,6 +125397,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -66674,33 +125412,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66713,9 +125461,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66727,22 +125475,22 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 LSPA: 128 - LSPB: 128 + LSPB: 64 LVCA: 2 - LVCB: 2 + LVCB: 4 LVPA: 32 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -66761,9 +125509,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66771,15 +125519,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66800,6 +125551,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66809,6 +125561,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -66823,33 +125576,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -66862,7 +125625,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66876,26 +125639,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 - LSPB: 128 + LSPA: 32 + LSPB: 32 LVCA: 2 LVCB: 2 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -66909,10 +125672,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66920,15 +125683,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66949,6 +125715,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -66958,6 +125725,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -66972,33 +125740,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 787 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67011,7 +125789,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67025,43 +125803,43 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67069,15 +125847,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67098,6 +125879,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67107,6 +125889,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -67121,33 +125904,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 788 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [2, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67159,8 +125952,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67174,8 +125967,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -67187,13 +125980,9 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67208,9 +125997,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67218,20 +126007,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67247,6 +126039,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67256,6 +126049,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -67270,96 +126064,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 789 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67367,15 +126171,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67396,6 +126203,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67405,6 +126213,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -67419,48 +126228,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 790 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: [4, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67472,43 +126291,43 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67516,15 +126335,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67545,6 +126367,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67554,6 +126377,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -67568,79 +126392,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 791 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -67653,11 +126487,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67665,15 +126499,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67694,6 +126531,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67703,6 +126541,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -67717,46 +126556,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 792 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67766,47 +126615,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67814,15 +126663,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67843,6 +126695,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -67852,6 +126705,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -67866,33 +126720,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67905,7 +126769,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67919,8 +126783,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -67965,13 +126829,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67992,6 +126859,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68001,6 +126869,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68015,46 +126884,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68068,26 +126947,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68100,11 +126979,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68112,15 +126991,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68141,6 +127023,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68150,6 +127033,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68164,33 +127048,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 795 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68203,57 +127097,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68261,15 +127155,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68290,6 +127187,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68299,6 +127197,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68313,33 +127212,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 796 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [2, 8, 4] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68352,9 +127261,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -68366,43 +127275,43 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 16 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68410,15 +127319,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68439,6 +127351,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68448,6 +127361,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68462,96 +127376,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 797 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68559,15 +127483,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68588,6 +127515,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68597,6 +127525,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68611,33 +127540,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 798 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68650,7 +127589,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68663,9 +127602,9 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -68707,16 +127646,19 @@ MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68728,6 +127670,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -68737,6 +127680,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68746,6 +127690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68760,39 +127705,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 799 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68812,27 +127767,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68845,7 +127800,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -68858,14 +127813,19 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68877,6 +127837,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -68886,6 +127847,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -68895,6 +127857,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -68909,39 +127872,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68961,27 +127932,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68994,7 +127965,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -69007,14 +127978,19 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69026,6 +128002,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69035,6 +128012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69044,6 +128022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69058,46 +128037,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 801 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69110,19 +128097,19 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 @@ -69143,11 +128130,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69155,15 +128142,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69175,6 +128165,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69184,6 +128175,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69193,6 +128185,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69207,46 +128200,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 802 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69259,27 +128262,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69292,10 +128295,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -69304,15 +128307,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69324,6 +128330,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69333,6 +128340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69342,6 +128350,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69356,39 +128365,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 803 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -69409,26 +128428,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69441,7 +128460,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -69455,13 +128474,18 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69473,6 +128497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69482,6 +128507,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69491,6 +128517,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69505,39 +128532,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 804 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -69558,18 +128593,18 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 @@ -69590,11 +128625,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69602,8 +128637,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -69611,6 +128646,11 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69622,6 +128662,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69631,6 +128672,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69640,6 +128682,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69654,46 +128697,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 805 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69707,26 +128758,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69739,10 +128790,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -69751,15 +128802,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69771,6 +128825,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69780,6 +128835,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69789,6 +128845,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69803,46 +128860,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 806 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69856,26 +128923,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69888,7 +128955,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -69902,13 +128969,16 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69920,6 +128990,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -69929,6 +129000,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -69938,6 +129010,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -69952,79 +129025,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 807 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 4 - LSCB: 4 - LSPA: 16 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70037,11 +129120,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70049,15 +129132,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70069,6 +129157,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70078,6 +129167,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70087,6 +129177,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70101,48 +129192,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 808 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id032 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70157,23 +129256,23 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 2 - LSCB: 2 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70186,11 +129285,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70198,15 +129297,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70218,6 +129322,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70227,6 +129332,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70236,6 +129342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70250,33 +129357,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 809 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id032 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70289,57 +129404,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70347,15 +129462,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70367,6 +129485,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70376,6 +129495,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70385,6 +129505,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70399,96 +129520,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 810 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70496,15 +129627,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70516,6 +129650,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70525,6 +129660,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70534,6 +129670,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70548,39 +129685,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 811 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70588,8 +129735,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70597,47 +129744,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70645,15 +129792,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70665,6 +129817,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70674,6 +129827,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70683,6 +129837,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70697,39 +129852,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 812 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70737,8 +129900,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70746,47 +129909,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70794,15 +129957,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70814,6 +129982,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70823,6 +129992,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70832,6 +130002,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70846,48 +130017,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 813 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70895,47 +130074,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70943,15 +130122,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70963,6 +130145,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -70972,6 +130155,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -70981,6 +130165,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -70995,33 +130180,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 814 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71034,9 +130229,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71044,47 +130239,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71092,15 +130287,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71112,6 +130310,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71121,6 +130320,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -71130,6 +130330,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -71141,35 +130342,46 @@ TileB: 1 TotalIndices: 4 TransposeA: true - TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71181,57 +130393,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71239,13 +130452,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71257,6 +130475,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71266,6 +130485,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -71275,6 +130495,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -71289,33 +130510,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 816 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id038 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -71327,7 +130558,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -71337,10 +130569,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -71354,30 +130586,30 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71385,13 +130617,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71403,6 +130642,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71412,6 +130652,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -71421,6 +130662,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -71435,45 +130677,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -71483,47 +130734,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71531,13 +130782,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71549,6 +130807,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71558,6 +130817,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -71567,6 +130827,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -71581,45 +130842,54 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 818 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -71629,47 +130899,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71677,13 +130947,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71695,6 +130972,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71704,6 +130982,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -71713,6 +130992,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -71727,54 +131007,63 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 819 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -71783,39 +131072,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71823,13 +131112,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71841,6 +131137,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71850,6 +131147,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -71859,6 +131157,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -71873,95 +131172,104 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 820 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id039 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id037 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71969,13 +131277,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71987,6 +131302,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -71996,6 +131312,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72005,6 +131322,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72019,33 +131337,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id037 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72057,57 +131383,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72115,13 +131442,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72133,6 +131465,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -72142,6 +131475,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72151,6 +131485,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72165,33 +131500,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 822 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id036 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id038 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72203,42 +131548,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72250,10 +131596,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72262,12 +131608,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72279,6 +131630,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -72288,6 +131640,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72297,6 +131650,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72311,13 +131665,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -72327,17 +131689,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id040 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72349,40 +131713,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -72396,10 +131761,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72409,11 +131774,18 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72425,6 +131797,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -72434,6 +131807,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72443,6 +131817,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72457,28 +131832,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -72497,42 +131879,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72544,9 +131926,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72555,12 +131937,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -72578,6 +131962,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -72587,6 +131972,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72596,6 +131982,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72614,8 +132001,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 469 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72624,23 +132011,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -72660,19 +132045,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -72681,20 +132066,20 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72707,9 +132092,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72717,13 +132102,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72740,6 +132125,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -72749,6 +132135,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72758,6 +132145,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72776,28 +132164,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 470 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -72822,41 +132210,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72869,9 +132257,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72879,12 +132267,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -72902,6 +132290,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -72911,6 +132300,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -72920,6 +132310,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -72938,8 +132329,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 471 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 827 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72947,18 +132338,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -72983,42 +132374,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73031,9 +132422,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73041,11 +132432,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -73064,6 +132457,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -73073,6 +132467,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -73082,6 +132477,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -73100,8 +132496,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 472 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 828 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73109,24 +132505,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73145,42 +132539,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73192,9 +132586,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73203,12 +132597,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73226,6 +132622,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -73235,6 +132632,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -73244,6 +132642,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -73262,8 +132661,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 473 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 829 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73271,24 +132670,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73308,41 +132705,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73355,9 +132752,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73365,11 +132762,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -73388,6 +132785,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -73397,6 +132795,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -73406,6 +132805,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -73424,8 +132824,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 474 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 830 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73433,18 +132833,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -73469,42 +132869,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73516,9 +132916,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73527,12 +132927,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73550,6 +132952,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -73559,6 +132962,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -73568,6 +132972,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -73586,8 +132991,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 475 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 831 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73595,24 +133000,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73632,41 +133035,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73678,9 +133081,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73689,12 +133092,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73712,6 +133115,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -73721,6 +133125,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -73730,6 +133135,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -73748,8 +133154,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 476 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 832 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73757,20 +133163,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73793,8 +133199,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -73803,32 +133209,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73840,9 +133246,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73851,8 +133257,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -73874,6 +133282,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -73883,6 +133292,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -73892,6 +133302,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -73910,33 +133321,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 477 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + SolutionIndex: 833 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -73965,10 +133374,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -73982,15 +133391,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74003,9 +133412,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74013,12 +133422,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74036,6 +133445,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -74045,6 +133455,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -74054,6 +133465,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -74072,8 +133484,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 478 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 834 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74081,20 +133493,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74110,16 +133522,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -74130,28 +133542,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 32 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 576 + LdsPadA: 1 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -74163,10 +133571,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74175,13 +133583,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -74191,13 +133599,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -74207,6 +133616,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -74216,6 +133626,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -74234,31 +133645,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 479 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 835 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -74278,43 +133689,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74326,10 +133733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74337,13 +133744,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -74353,13 +133762,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -74369,6 +133779,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -74378,6 +133789,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -74396,33 +133808,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 480 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 836 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74440,43 +133850,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74488,10 +133894,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74499,13 +133905,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -74515,13 +133923,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -74531,6 +133940,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -74540,6 +133950,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -74558,33 +133969,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 481 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 837 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74602,8 +134011,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -74616,7 +134025,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74630,15 +134039,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74650,10 +134055,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74661,12 +134066,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74677,13 +134084,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -74693,6 +134101,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -74702,6 +134111,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -74720,8 +134130,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 482 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 838 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74729,24 +134139,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -74778,7 +134186,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74792,11 +134200,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74808,9 +134216,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74819,11 +134227,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -74842,6 +134250,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -74851,6 +134260,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -74860,6 +134270,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -74878,29 +134289,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 483 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 839 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74922,8 +134333,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -74936,7 +134347,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74950,15 +134361,11 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74970,9 +134377,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74981,11 +134388,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -74997,13 +134406,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75013,6 +134423,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75022,6 +134433,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -75040,8 +134452,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 484 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 840 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75049,24 +134461,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75084,7 +134494,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -75098,29 +134508,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75132,10 +134538,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75143,13 +134549,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75159,13 +134565,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75175,6 +134582,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75184,6 +134592,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -75202,29 +134611,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 485 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 841 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -75246,8 +134655,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75260,29 +134669,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75294,10 +134699,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75305,13 +134710,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75321,13 +134728,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75337,6 +134745,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75346,6 +134755,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -75364,33 +134774,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 486 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 842 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75402,13 +134810,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -75422,29 +134830,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 0 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75455,11 +134859,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75467,8 +134871,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -75483,13 +134887,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75499,6 +134904,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75508,6 +134914,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -75526,31 +134933,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 487 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + SolutionIndex: 843 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75570,8 +134977,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75581,32 +134988,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75619,9 +135022,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75630,12 +135033,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75645,13 +135050,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75661,6 +135067,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75670,6 +135077,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -75688,15 +135096,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 488 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 844 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -75708,13 +135116,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -75732,10 +135138,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -75743,10 +135149,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -75754,21 +135160,17 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75780,10 +135182,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75792,10 +135194,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -75807,13 +135209,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75823,6 +135226,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75832,6 +135236,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -75850,15 +135255,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 489 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 845 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -75870,9 +135275,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -75894,8 +135299,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -75905,32 +135310,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75942,10 +135343,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75953,13 +135354,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75969,13 +135372,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -75985,6 +135389,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -75994,6 +135399,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76012,33 +135418,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 490 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 846 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76070,7 +135474,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76084,11 +135488,11 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76100,9 +135504,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76111,11 +135515,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -76134,6 +135538,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -76143,6 +135548,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -76152,6 +135558,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76170,8 +135577,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 491 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 847 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76180,17 +135587,17 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -76215,7 +135622,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76228,25 +135635,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76258,10 +135665,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76269,13 +135676,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76292,6 +135701,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -76301,6 +135711,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -76310,6 +135721,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76328,33 +135740,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 492 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 848 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76366,14 +135776,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76383,32 +135793,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76419,10 +135825,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 32 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76431,12 +135837,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -76447,13 +135855,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -76463,6 +135872,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -76472,6 +135882,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76490,8 +135901,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 493 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 849 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76500,23 +135911,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76528,16 +135937,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -76545,32 +135954,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 - LVCA: 4 + LSPB: 32 + LVCA: 2 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 864 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76581,11 +135986,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76594,12 +135999,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76609,13 +136014,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -76625,6 +136031,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -76634,6 +136041,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76652,15 +136060,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 494 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 850 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -76672,11 +136080,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -76696,10 +136104,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -76710,29 +136118,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76744,9 +136148,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76756,12 +136160,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76771,13 +136177,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -76787,6 +136194,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -76796,6 +136204,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76814,16 +136223,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 495 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 851 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76834,13 +136243,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -76858,7 +136265,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -76869,10 +136276,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76886,15 +136293,11 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76918,7 +136321,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -76933,13 +136336,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -76949,6 +136353,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -76958,6 +136363,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -76976,8 +136382,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 496 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 852 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76996,9 +136402,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -77014,16 +136420,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -77039,24 +136445,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 576 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77067,10 +136469,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 256 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77079,13 +136481,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -77095,13 +136497,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -77111,6 +136514,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -77120,6 +136524,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -77138,31 +136543,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 497 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 853 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -77176,13 +136581,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -77201,24 +136606,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77229,11 +136630,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77241,13 +136642,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -77257,13 +136658,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -77273,6 +136675,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -77282,6 +136685,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -77300,37 +136704,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 498 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 854 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77355,13 +136759,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -77372,14 +136777,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -77392,9 +136797,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77403,11 +136808,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77419,6 +136824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -77426,6 +136832,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -77435,6 +136842,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -77444,6 +136852,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -77462,8 +136871,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 499 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 855 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77472,27 +136881,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 32 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77507,16 +136916,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -77524,24 +136933,25 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -77554,10 +136964,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77565,12 +136975,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -77581,6 +136993,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -77588,6 +137001,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -77597,6 +137011,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -77606,6 +137021,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -77624,8 +137040,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 500 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 856 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77633,12 +137049,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -77649,12 +137065,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77669,7 +137083,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77686,6 +137100,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -77696,14 +137111,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -77717,9 +137132,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77727,12 +137142,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -77743,13 +137160,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -77759,6 +137178,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -77768,6 +137188,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -77786,8 +137207,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 501 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 857 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77795,12 +137216,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -77811,12 +137232,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77830,10 +137249,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -77844,40 +137263,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77885,11 +137309,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77901,13 +137325,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -77917,6 +137343,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -77926,6 +137353,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -77944,28 +137372,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 502 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 858 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77974,7 +137402,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -77990,7 +137418,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -78006,39 +137434,40 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -78047,11 +137476,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -78063,6 +137492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -78070,6 +137500,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -78079,6 +137510,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -78088,6 +137520,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -78106,28 +137539,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 503 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 859 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -78136,7 +137569,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -78144,23 +137577,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -78168,24 +137601,25 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -78197,7 +137631,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -78211,11 +137645,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78225,6 +137662,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -78232,6 +137670,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -78241,6 +137680,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -78250,6 +137690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -78268,8 +137709,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 504 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 860 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78277,7 +137718,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -78290,15 +137731,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -78306,15 +137745,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -78322,7 +137761,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -78330,21 +137769,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2112 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78355,11 +137799,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78369,11 +137813,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78383,13 +137830,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -78399,6 +137848,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -78408,6 +137858,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -78426,15 +137877,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 505 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 861 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -78447,16 +137898,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -78470,8 +137919,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78481,32 +137930,29 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 0 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78519,9 +137965,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78530,12 +137976,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78545,13 +137994,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -78561,6 +138012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -78570,6 +138022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -78588,15 +138041,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 506 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 862 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -78608,17 +138061,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -78626,14 +138077,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -78650,25 +138101,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 + LdsOffsetB: 1088 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78679,11 +138127,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78693,11 +138141,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78707,13 +138158,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -78723,6 +138176,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -78732,6 +138186,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -78750,16 +138205,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 507 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 863 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -78771,16 +138226,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -78788,7 +138241,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78797,7 +138250,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78808,29 +138261,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3600 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78841,10 +138295,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -78853,13 +138307,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78869,6 +138324,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -78876,6 +138332,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -78885,6 +138342,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -78894,6 +138352,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -78912,8 +138371,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 508 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 864 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78921,28 +138380,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -78956,7 +138415,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -78967,32 +138426,29 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79005,9 +138461,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79016,12 +138472,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79031,13 +138488,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -79047,6 +138506,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -79056,6 +138516,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -79074,15 +138535,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 509 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 865 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -79094,9 +138555,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79104,7 +138565,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -79121,7 +138582,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79129,47 +138590,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79177,13 +138639,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79193,6 +138656,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -79200,6 +138664,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -79209,6 +138674,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -79218,6 +138684,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -79236,15 +138703,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 510 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 866 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -79256,8 +138723,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -79266,7 +138733,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -79274,64 +138741,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79339,13 +138807,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79355,6 +138826,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -79362,6 +138834,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -79371,6 +138844,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -79380,6 +138854,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -79398,37 +138873,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 511 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 867 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -79436,15 +138909,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -79453,46 +138926,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 32 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -79501,13 +138975,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79517,6 +138994,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -79524,6 +139002,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -79533,6 +139012,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -79542,6 +139022,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -79560,37 +139041,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 512 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 868 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -79598,53 +139077,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -79652,10 +139132,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79663,13 +139143,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79679,6 +139162,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -79686,6 +139170,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -79695,6 +139180,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -79704,6 +139190,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -79722,37 +139209,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 513 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 869 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -79760,16 +139245,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79780,44 +139265,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79825,13 +139311,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79841,6 +139330,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -79848,6 +139338,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -79857,6 +139348,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -79866,6 +139358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -79884,37 +139377,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 514 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 870 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -79928,54 +139419,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79983,13 +139479,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79999,13 +139496,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80015,6 +139514,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80024,6 +139524,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -80042,28 +139543,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 515 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 871 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -80072,7 +139573,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -80087,9 +139588,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80097,47 +139598,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80145,13 +139647,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80161,6 +139666,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -80168,6 +139674,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80177,6 +139684,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80186,6 +139694,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -80204,37 +139713,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 516 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 872 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -80242,7 +139749,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80251,7 +139758,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80259,32 +139766,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 96 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80295,10 +139803,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80307,13 +139815,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80323,6 +139832,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -80330,6 +139840,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80339,6 +139850,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80348,6 +139860,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -80366,8 +139879,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 517 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 873 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80376,27 +139889,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -80404,15 +139917,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -80428,25 +139941,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80457,7 +139971,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -80471,11 +139985,14 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80485,13 +140002,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80501,6 +140020,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80510,6 +140030,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -80528,37 +140049,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 518 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 874 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -80566,15 +140085,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -80590,25 +140109,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13376 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80619,25 +140139,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 256 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 256 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80647,6 +140170,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -80654,6 +140178,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80663,6 +140188,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80672,6 +140198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -80690,20 +140217,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 519 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + SolutionIndex: 875 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -80711,16 +140238,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -80728,16 +140253,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80752,25 +140277,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80781,11 +140307,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80795,11 +140321,14 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80809,13 +140338,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80825,6 +140356,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80834,6 +140366,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -80852,8 +140385,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 520 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 876 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80861,12 +140394,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80876,13 +140409,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -80896,10 +140427,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80914,21 +140445,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -80941,9 +140477,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80951,13 +140487,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80967,13 +140506,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -80983,6 +140524,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -80992,6 +140534,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81010,37 +140553,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 521 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 877 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81048,7 +140589,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81065,32 +140606,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81101,11 +140643,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81113,13 +140655,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81129,6 +140672,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -81136,6 +140680,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -81145,6 +140690,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81154,6 +140700,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81172,37 +140719,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 522 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 878 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81217,7 +140764,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -81227,13 +140774,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -81244,15 +140792,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81277,11 +140825,14 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81291,6 +140842,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -81298,6 +140850,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -81307,6 +140860,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81316,6 +140870,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81334,8 +140889,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 523 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 879 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81359,12 +140914,10 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81372,7 +140925,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81381,7 +140934,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -81396,25 +140949,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81425,11 +140979,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81437,13 +140991,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81453,6 +141008,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -81460,6 +141016,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -81469,6 +141026,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81478,6 +141036,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81496,20 +141055,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 524 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 880 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -81517,16 +141076,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81558,6 +141117,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -81568,30 +141128,30 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12864 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 4160 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 8704 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 256 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 256 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81599,13 +141159,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81615,6 +141176,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -81622,6 +141184,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -81631,6 +141194,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81640,6 +141204,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81658,29 +141223,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + SolutionIndex: 881 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -81688,7 +141253,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81703,42 +141268,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81751,23 +141317,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81777,6 +141348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -81784,6 +141356,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -81793,6 +141366,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81802,8 +141376,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -81820,37 +141396,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 882 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -81865,16 +141439,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -81882,25 +141456,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81913,23 +141488,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81939,6 +141519,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -81946,6 +141527,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -81955,6 +141537,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81964,8 +141547,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -81982,8 +141567,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 883 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81991,12 +141576,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -82004,15 +141589,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82027,7 +141610,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -82044,6 +141627,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -82054,15 +141638,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82074,24 +141658,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82101,6 +141690,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -82108,6 +141698,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -82117,6 +141708,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82126,8 +141718,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -82144,37 +141738,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 884 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82189,7 +141781,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -82199,32 +141791,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82237,23 +141830,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82263,6 +141861,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -82270,6 +141869,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -82279,6 +141879,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82288,8 +141889,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -82306,15 +141909,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 885 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82327,16 +141930,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82344,7 +141945,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82352,41 +141953,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82397,15 +141999,17 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -82413,9 +142017,10 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82425,6 +142030,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -82432,6 +142038,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -82441,6 +142048,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82450,8 +142058,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -82468,15 +142078,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 886 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82489,16 +142099,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82506,7 +142116,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82526,29 +142136,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82559,25 +142170,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82587,6 +142201,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -82594,6 +142209,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -82603,6 +142219,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82612,8 +142229,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -82630,16 +142249,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 887 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -82650,17 +142269,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82668,7 +142287,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82677,7 +142296,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82688,29 +142307,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3408 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82721,25 +142341,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82749,6 +142372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -82756,6 +142380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -82765,6 +142390,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82774,8 +142400,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -82792,16 +142420,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 888 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -82812,17 +142440,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82836,7 +142464,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -82847,32 +142475,29 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82885,23 +142510,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82911,13 +142539,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -82927,6 +142557,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82936,8 +142567,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -82954,15 +142587,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 889 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -82974,9 +142607,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -82984,7 +142617,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -82999,42 +142632,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83046,24 +142680,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83073,6 +142712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -83080,6 +142720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -83089,6 +142730,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83098,8 +142740,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -83116,37 +142760,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 890 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -83161,42 +142803,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83209,23 +142852,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83235,6 +142883,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -83242,6 +142891,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -83251,6 +142901,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83260,8 +142911,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -83278,8 +142931,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 891 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83288,11 +142941,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -83300,15 +142953,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -83316,15 +142967,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -83333,32 +142984,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83369,7 +143021,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -83377,17 +143029,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83397,13 +143054,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -83413,6 +143072,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83422,8 +143082,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -83440,8 +143102,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 892 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83460,17 +143122,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -83485,9 +143145,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -83495,32 +143155,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83532,24 +143193,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83559,13 +143225,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -83575,6 +143243,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83584,8 +143253,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -83602,37 +143273,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 893 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -83647,7 +143316,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -83657,32 +143326,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83695,23 +143365,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83721,6 +143396,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -83728,6 +143404,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -83737,6 +143414,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83746,8 +143424,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -83764,37 +143444,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 894 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -83808,8 +143486,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -83819,13 +143497,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -83836,11 +143515,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83853,23 +143536,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83879,13 +143567,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -83895,6 +143585,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83904,8 +143595,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -83922,8 +143615,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 895 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -83932,27 +143625,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -83960,7 +143651,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83968,41 +143659,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84013,25 +143705,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84041,6 +143736,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -84048,6 +143744,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -84057,6 +143754,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84066,8 +143764,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -84084,8 +143784,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 896 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84094,27 +143794,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -84139,13 +143839,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -84156,15 +143857,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84176,14 +143877,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -84191,9 +143894,10 @@ NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84203,6 +143907,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -84210,6 +143915,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -84219,6 +143925,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84228,8 +143935,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -84246,8 +143955,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 897 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84256,11 +143965,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -84268,7 +143977,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84276,7 +143985,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -84284,7 +143993,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84292,7 +144001,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -84301,32 +144010,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84337,25 +144047,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84365,13 +144078,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -84381,6 +144096,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84390,8 +144106,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -84408,15 +144126,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + SolutionIndex: 898 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -84428,17 +144146,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -84466,29 +144184,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84501,23 +144220,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84527,6 +144249,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -84534,6 +144257,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -84543,6 +144267,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84552,8 +144277,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -84570,29 +144297,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 899 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84600,7 +144327,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -84628,10 +144355,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -84642,15 +144370,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84663,23 +144391,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84689,6 +144420,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -84696,6 +144428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -84705,6 +144438,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84714,8 +144448,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -84732,8 +144468,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 900 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84741,18 +144477,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -84762,7 +144498,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -84770,49 +144506,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84823,7 +144560,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -84831,17 +144568,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84851,13 +144593,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -84867,6 +144611,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84876,8 +144621,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -84894,8 +144641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 901 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84903,7 +144650,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -84914,17 +144661,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -84932,49 +144677,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84985,25 +144731,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85013,13 +144764,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85029,6 +144782,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85038,8 +144792,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -85056,15 +144812,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 902 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -85076,17 +144832,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -85094,16 +144848,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -85111,32 +144865,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85147,25 +144902,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85175,13 +144935,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85191,6 +144953,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85200,8 +144963,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -85218,20 +144983,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 903 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -85239,16 +145004,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -85256,16 +145019,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -85273,32 +145036,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85309,25 +145073,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85337,13 +145106,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85353,6 +145124,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85362,8 +145134,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -85380,15 +145154,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 904 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -85400,17 +145174,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -85425,7 +145197,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85435,13 +145207,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -85452,15 +145225,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85472,24 +145245,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85499,6 +145277,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -85506,6 +145285,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85515,6 +145295,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85524,8 +145305,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -85542,8 +145325,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 905 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85552,27 +145335,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -85580,7 +145361,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85588,41 +145369,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85633,25 +145415,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85661,13 +145446,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85677,6 +145464,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85686,8 +145474,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -85704,8 +145494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 906 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85714,27 +145504,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -85742,7 +145532,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85750,41 +145540,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85795,25 +145586,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85823,13 +145617,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85839,6 +145635,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85848,8 +145645,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -85866,8 +145665,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 907 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85875,28 +145674,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -85904,7 +145703,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85912,41 +145711,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85957,25 +145757,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85985,6 +145788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -85992,6 +145796,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86001,6 +145806,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86010,8 +145816,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -86028,8 +145836,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 908 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86037,28 +145845,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86066,7 +145874,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86075,7 +145883,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86083,32 +145891,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86119,15 +145928,17 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -86136,8 +145947,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86147,13 +145959,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86163,6 +145977,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86172,8 +145987,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -86190,8 +146007,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 909 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86200,11 +146017,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -86212,15 +146029,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86228,7 +146045,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86237,7 +146054,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86245,32 +146062,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86281,7 +146099,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -86289,7 +146107,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -86297,9 +146117,10 @@ NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86309,13 +146130,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86325,6 +146148,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86334,8 +146158,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -86352,8 +146178,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 910 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86374,15 +146200,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86390,7 +146216,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86399,7 +146225,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86407,32 +146233,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86443,25 +146270,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86471,13 +146301,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86487,6 +146319,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86496,8 +146329,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -86514,37 +146349,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 911 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86552,7 +146387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86561,7 +146396,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -86569,32 +146404,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86605,25 +146441,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86633,6 +146472,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -86640,6 +146480,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86649,6 +146490,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86658,8 +146500,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -86676,8 +146520,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 912 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86686,27 +146530,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86714,49 +146558,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 9280 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86767,25 +146612,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86795,13 +146645,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86811,6 +146663,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86820,8 +146673,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -86838,37 +146693,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 913 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -86876,23 +146729,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -86900,25 +146753,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14464 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86929,25 +146783,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86957,13 +146816,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86973,6 +146834,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86982,8 +146844,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87000,37 +146864,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 914 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -87038,49 +146900,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87091,25 +146954,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87119,13 +146987,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87135,6 +147005,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87144,8 +147015,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87162,8 +147035,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 915 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87172,27 +147045,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -87200,15 +147071,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -87216,7 +147087,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -87224,25 +147095,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 4 - LVPA: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3424 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87253,25 +147125,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87281,6 +147158,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -87288,6 +147166,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87297,6 +147176,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87306,8 +147186,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87324,37 +147206,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 916 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -87362,15 +147242,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -87378,7 +147258,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -87386,25 +147266,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87415,25 +147296,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87443,6 +147329,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -87450,6 +147337,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87459,6 +147347,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87468,8 +147357,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87486,20 +147377,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 917 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -87507,16 +147398,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -87531,7 +147420,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -87544,29 +147433,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87578,24 +147468,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87605,6 +147500,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -87612,6 +147508,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87621,6 +147518,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87630,8 +147528,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87648,37 +147548,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -87693,8 +147591,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -87710,25 +147608,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87740,24 +147639,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87767,6 +147671,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -87774,6 +147679,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87783,6 +147689,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87792,8 +147699,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87810,37 +147719,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -87856,41 +147763,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87902,24 +147810,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87929,13 +147840,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87945,6 +147858,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87954,8 +147868,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -87972,8 +147888,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87981,18 +147897,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -88002,7 +147918,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -88018,41 +147934,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88065,23 +147982,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88091,6 +148011,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -88098,6 +148019,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88107,6 +148029,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88116,8 +148039,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -88134,8 +148059,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 921 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88143,18 +148068,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -88164,7 +148089,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -88180,7 +148105,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -88188,7 +148113,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88196,25 +148121,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88227,23 +148153,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88253,13 +148182,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88269,6 +148200,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88278,8 +148210,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -88296,29 +148230,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88326,7 +148260,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -88342,15 +148276,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88358,25 +148292,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88389,23 +148324,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88415,6 +148353,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -88422,6 +148361,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88431,6 +148371,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88440,8 +148381,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -88458,20 +148401,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -88479,7 +148422,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -88488,7 +148431,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -88502,9 +148445,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -88512,7 +148455,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88520,21 +148463,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4736 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 4160 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88546,24 +148494,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88573,13 +148524,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88589,6 +148542,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88598,8 +148552,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -88616,29 +148572,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 924 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88646,7 +148602,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -88671,13 +148627,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -88688,15 +148645,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88709,23 +148666,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88735,13 +148695,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88751,6 +148713,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88760,8 +148723,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -88778,8 +148743,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 925 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88788,17 +148753,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -88808,7 +148773,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -88833,32 +148798,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88870,24 +148836,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88897,13 +148866,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88913,6 +148884,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88922,8 +148894,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -88940,29 +148914,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -88970,7 +148944,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89002,6 +148976,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -89012,15 +148987,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89032,24 +149007,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89059,6 +149037,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89066,6 +149045,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89075,6 +149055,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89084,8 +149065,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -89102,29 +149085,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 927 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89132,7 +149115,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89164,6 +149147,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -89174,15 +149158,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89194,24 +149178,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89221,6 +149208,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89228,6 +149216,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89237,6 +149226,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89246,8 +149236,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -89264,8 +149256,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 928 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89273,12 +149265,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -89294,7 +149286,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89309,8 +149301,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -89326,54 +149318,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89383,6 +149381,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89390,6 +149389,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89399,6 +149399,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89408,8 +149409,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -89426,37 +149429,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 929 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89471,8 +149472,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -89488,54 +149489,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89545,6 +149552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89552,6 +149560,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89561,6 +149570,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89570,8 +149580,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -89588,20 +149600,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 930 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -89609,16 +149621,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89633,9 +149643,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89646,58 +149656,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89707,6 +149723,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89714,6 +149731,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89723,6 +149741,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89732,8 +149751,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -89750,37 +149771,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 931 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89788,15 +149807,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -89804,7 +149823,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89812,21 +149831,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2688 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2112 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89837,25 +149861,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89865,13 +149894,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89881,6 +149912,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89890,14 +149922,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -89908,15 +149943,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 932 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -89929,16 +149964,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -89946,15 +149979,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -89962,7 +149995,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89970,25 +150003,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89999,25 +150033,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90027,6 +150066,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -90034,6 +150074,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90043,6 +150084,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90052,14 +150094,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -90070,16 +150115,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 933 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -90091,16 +150136,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90108,23 +150151,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90132,25 +150175,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -90161,25 +150205,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90189,6 +150238,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -90196,6 +150246,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90205,6 +150256,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90214,14 +150266,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -90232,37 +150287,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 934 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90270,16 +150323,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90287,32 +150340,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -90323,25 +150377,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90351,6 +150410,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -90358,6 +150418,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90367,6 +150428,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90376,14 +150438,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -90394,15 +150459,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 935 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -90415,16 +150480,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90432,16 +150495,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -90456,25 +150519,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -90485,25 +150549,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90513,6 +150582,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -90520,6 +150590,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90529,6 +150600,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90538,14 +150610,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -90556,16 +150631,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 936 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -90577,16 +150652,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90600,9 +150673,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -90618,25 +150691,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -90648,24 +150718,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90675,13 +150750,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90691,6 +150768,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90700,14 +150778,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -90718,16 +150799,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 937 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -90739,16 +150820,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90763,7 +150842,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90780,6 +150859,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -90790,15 +150870,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -90811,23 +150891,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90837,6 +150922,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -90844,6 +150930,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90853,6 +150940,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90862,14 +150950,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -90880,20 +150971,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 938 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90901,16 +150992,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -90925,7 +151014,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90942,6 +151031,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -90952,15 +151042,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -90979,17 +151069,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90999,6 +151094,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -91006,6 +151102,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91015,6 +151112,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91024,14 +151122,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -91042,8 +151143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 939 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91051,7 +151152,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -91064,15 +151165,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91080,7 +151179,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91088,15 +151187,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -91104,25 +151203,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91133,25 +151233,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91161,13 +151264,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91177,6 +151282,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91186,14 +151292,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -91204,8 +151313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 940 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91213,12 +151322,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -91226,15 +151335,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91242,7 +151351,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91250,15 +151359,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -91266,25 +151375,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91295,25 +151405,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91323,6 +151436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -91330,6 +151444,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91339,6 +151454,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91348,14 +151464,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -91366,8 +151485,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 941 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91375,11 +151494,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -91388,15 +151507,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91404,23 +151523,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -91428,25 +151547,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91457,25 +151573,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91485,13 +151604,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91501,6 +151622,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91510,14 +151632,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -91528,8 +151653,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 942 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91537,12 +151662,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -91550,15 +151675,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91575,7 +151700,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91590,25 +151715,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91620,14 +151746,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -91635,9 +151763,10 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91647,13 +151776,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91663,6 +151794,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91672,14 +151804,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -91690,16 +151825,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 943 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -91711,8 +151846,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91720,7 +151855,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91728,7 +151863,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91736,8 +151871,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91752,25 +151887,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91781,25 +151917,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91809,6 +151948,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -91816,6 +151956,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91825,6 +151966,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91834,14 +151976,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -91852,37 +151997,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 944 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91890,7 +152035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91899,7 +152044,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91907,32 +152052,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91943,15 +152089,17 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -91960,8 +152108,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91971,6 +152120,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -91978,6 +152128,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91987,6 +152138,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91996,14 +152148,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92014,15 +152169,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 945 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -92035,16 +152190,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -92052,13 +152207,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92076,25 +152231,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92105,15 +152257,17 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -92122,8 +152276,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92133,13 +152288,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92149,6 +152306,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92158,14 +152316,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92176,16 +152337,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 946 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -92197,16 +152358,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -92214,7 +152375,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92223,7 +152384,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -92234,29 +152395,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92267,25 +152429,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92295,6 +152460,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92302,6 +152468,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92311,6 +152478,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92320,14 +152488,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92338,8 +152509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92347,28 +152518,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -92376,7 +152547,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -92385,7 +152556,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -92393,32 +152564,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92429,25 +152601,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92457,6 +152632,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92464,6 +152640,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92473,6 +152650,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92482,14 +152660,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92500,37 +152681,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 948 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -92538,23 +152719,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -92562,25 +152743,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92591,25 +152773,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92619,6 +152806,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92626,6 +152814,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92635,6 +152824,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92644,14 +152834,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92662,8 +152855,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 949 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92671,12 +152864,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -92684,15 +152877,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -92700,15 +152891,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -92720,29 +152911,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92753,7 +152945,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -92761,17 +152953,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92781,6 +152978,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92788,6 +152986,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92797,6 +152996,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92806,14 +153006,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92824,8 +153027,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 950 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92833,7 +153036,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -92844,17 +153047,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -92862,15 +153063,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -92886,25 +153087,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92915,25 +153117,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92943,6 +153150,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92950,6 +153158,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92959,6 +153168,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92968,14 +153178,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -92986,8 +153199,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 951 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92995,12 +153208,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -93008,15 +153221,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93024,15 +153235,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -93048,25 +153259,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93077,7 +153289,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -93085,17 +153297,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93105,6 +153322,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -93112,6 +153330,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93121,6 +153340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93130,14 +153350,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -93148,8 +153371,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 952 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93157,7 +153380,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -93170,15 +153393,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93186,7 +153407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93194,7 +153415,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -93202,31 +153423,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -93239,25 +153461,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93267,6 +153492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -93274,6 +153500,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93293,15 +153520,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -93312,8 +153541,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93321,28 +153550,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93350,16 +153579,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93370,23 +153599,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -93403,25 +153633,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93431,13 +153666,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93457,15 +153694,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -93476,8 +153715,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93485,28 +153724,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93514,14 +153751,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -93534,58 +153771,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93595,6 +153838,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -93602,6 +153846,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93621,15 +153866,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -93640,37 +153887,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 + SolutionIndex: 955 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93686,39 +153931,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -93732,24 +153978,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93759,6 +154008,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -93766,6 +154016,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93785,15 +154036,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -93804,29 +154057,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 + SolutionIndex: 956 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -93834,7 +154087,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -93842,13 +154095,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -93862,58 +154115,58 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93923,13 +154176,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93949,15 +154204,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -93968,16 +154225,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 + SolutionIndex: 957 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -93988,17 +154245,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94006,7 +154263,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94015,7 +154272,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94026,58 +154283,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94087,6 +154348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94094,6 +154356,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94113,15 +154376,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -94132,37 +154397,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 + SolutionIndex: 958 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94170,7 +154435,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94179,7 +154444,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94190,58 +154455,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94251,13 +154520,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94277,15 +154548,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -94296,37 +154569,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 + SolutionIndex: 959 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94354,27 +154627,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 + LSCB: 8 + LSPA: 128 + LSPB: 128 LVCA: 2 LVCB: 2 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94388,24 +154662,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94415,6 +154692,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94422,6 +154700,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94441,15 +154720,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -94460,29 +154741,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 + SolutionIndex: 960 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -94490,7 +154771,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94498,7 +154779,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94507,7 +154788,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94515,61 +154796,65 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94579,6 +154864,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94586,6 +154872,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94605,15 +154892,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -94624,37 +154913,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 + SolutionIndex: 961 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94662,53 +154951,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94717,23 +155007,28 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94743,6 +155038,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94750,6 +155046,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94769,15 +155066,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -94788,37 +155087,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 + SolutionIndex: 962 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94833,8 +155130,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -94842,7 +155139,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94850,54 +155147,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 LSPA: 32 - LSPB: 8 - LVCA: 2 + LSPB: 64 + LVCA: 8 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 1024 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94907,6 +155210,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94914,6 +155218,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94933,15 +155238,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -94952,16 +155259,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 + SolutionIndex: 963 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -94973,16 +155280,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 2, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94997,8 +155302,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95014,54 +155319,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 128 + LSPA: 32 LSPB: 64 - LVCA: 2 + LVCA: 8 LVCB: 4 LVPA: 32 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95071,6 +155382,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95078,6 +155390,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95097,15 +155410,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -95116,20 +155431,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 964 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -95137,16 +155452,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95154,15 +155467,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -95178,29 +155491,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95209,23 +155523,28 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95235,6 +155554,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95242,6 +155562,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95261,15 +155582,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -95280,16 +155603,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 965 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -95301,16 +155624,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95318,7 +155639,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95326,15 +155647,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95342,17 +155663,18 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -95364,7 +155686,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95372,14 +155694,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -95388,8 +155712,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95399,6 +155724,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95406,6 +155732,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95425,15 +155752,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -95444,20 +155773,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 + SolutionIndex: 966 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 8 - SubGroupA: 2 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -95465,16 +155794,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95482,23 +155811,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95506,50 +155835,58 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 32 LSPB: 64 LVCA: 4 LVCB: 4 LVPA: 16 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95559,13 +155896,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95585,15 +155924,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -95604,14 +155945,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -95625,16 +155966,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95642,23 +155983,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95666,29 +156007,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95696,24 +156038,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95723,6 +156070,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95730,6 +156078,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95749,15 +156098,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -95768,37 +156119,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95813,16 +156162,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95830,54 +156179,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 - LVCB: 4 - LVPA: 8 - LVPB: 4 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95887,6 +156242,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95894,6 +156250,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95913,15 +156270,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -95932,16 +156291,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 + SolutionIndex: 969 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -95953,16 +156312,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95977,16 +156334,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95994,54 +156351,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96051,6 +156414,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96058,6 +156422,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96077,15 +156442,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -96096,16 +156463,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 + SolutionIndex: 970 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96117,16 +156484,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96134,16 +156499,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96158,54 +156523,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96215,6 +156586,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96222,6 +156594,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96241,15 +156614,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -96260,15 +156635,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 + SolutionIndex: 971 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -96280,17 +156655,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96298,16 +156671,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96322,54 +156695,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96379,6 +156758,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96386,6 +156766,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96405,15 +156786,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -96424,16 +156807,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 972 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96445,16 +156828,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96470,15 +156851,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -96486,54 +156867,58 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96543,6 +156928,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96550,6 +156936,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96569,15 +156956,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -96588,16 +156977,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 973 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96609,8 +156998,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -96618,7 +157007,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96626,7 +157015,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96634,15 +157023,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -96650,29 +157039,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 4 - LVPB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96680,14 +157070,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -96696,8 +157088,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96707,6 +157100,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96714,6 +157108,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96733,15 +157128,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -96752,20 +157149,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 + SolutionIndex: 974 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 2 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -96773,16 +157170,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96807,61 +157204,65 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 + LSPA: 64 + LSPB: 32 LVCA: 4 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96871,6 +157272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96878,6 +157280,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96897,15 +157300,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -96916,29 +157321,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 975 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96946,7 +157351,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96954,7 +157359,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96963,7 +157368,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96974,58 +157379,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97035,6 +157444,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97042,6 +157452,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97061,15 +157472,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -97080,16 +157493,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 976 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -97101,16 +157514,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97126,70 +157539,74 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97199,6 +157616,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97226,15 +157644,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -97245,28 +157665,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -97275,7 +157695,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97290,9 +157710,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97303,60 +157723,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97366,6 +157788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97393,15 +157816,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -97412,35 +157837,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 978 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97455,9 +157882,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97468,60 +157895,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97531,6 +157960,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97558,15 +157988,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -97577,35 +158009,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 979 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97620,16 +158054,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -97637,54 +158071,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97694,6 +158134,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97721,15 +158162,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -97740,37 +158183,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 980 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97785,9 +158226,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97802,54 +158243,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97859,6 +158306,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97886,15 +158334,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -97905,37 +158355,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 981 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97950,8 +158398,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -97963,60 +158411,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 8 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 128 + LdsOffsetB_Blk: 2176 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98026,6 +158476,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98053,15 +158504,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -98072,35 +158525,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 982 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98116,7 +158571,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98128,60 +158583,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98191,6 +158650,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98218,15 +158678,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -98237,35 +158699,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98280,8 +158742,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98293,58 +158755,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98354,6 +158822,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98381,15 +158850,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -98400,37 +158871,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98445,8 +158914,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98462,54 +158931,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98519,6 +158994,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98546,15 +159022,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -98565,37 +159043,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 985 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98611,7 +159087,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98627,56 +159103,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98686,6 +159166,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98713,15 +159194,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -98732,35 +159215,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 986 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98776,7 +159259,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98785,63 +159268,67 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98851,6 +159338,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98878,15 +159366,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -98897,35 +159387,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 987 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98940,71 +159430,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99014,6 +159510,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99041,15 +159538,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -99060,37 +159559,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 988 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99105,71 +159602,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99179,6 +159682,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99206,15 +159710,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -99225,37 +159731,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 989 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99272,71 +159776,75 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99346,6 +159854,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99373,15 +159882,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -99392,35 +159903,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 990 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99435,7 +159946,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -99444,64 +159955,66 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99511,6 +160024,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99538,15 +160052,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -99557,35 +160073,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 991 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99600,71 +160118,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99674,6 +160198,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99701,15 +160226,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -99720,37 +160247,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 992 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99765,71 +160290,77 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99839,6 +160370,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99866,15 +160398,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -99885,37 +160419,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 993 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99930,9 +160462,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -99940,61 +160472,67 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 16 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100004,6 +160542,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100031,15 +160570,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -100050,37 +160591,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100096,7 +160635,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -100104,64 +160643,68 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 + LSPA: 64 LSPB: 16 - LVCA: 16 + LVCA: 4 LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100171,6 +160714,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100198,15 +160742,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -100217,35 +160763,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 995 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100261,7 +160807,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -100269,7 +160815,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100277,56 +160823,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 + LSPA: 64 LSPB: 16 - LVCA: 16 + LVCA: 4 LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100336,6 +160886,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100363,15 +160914,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -100382,35 +160935,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100418,23 +160971,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100442,56 +160995,58 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 2 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100501,6 +161056,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100528,15 +161084,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -100547,35 +161105,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 997 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100603,10 +161163,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -100617,15 +161178,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100638,25 +161199,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100666,6 +161230,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100693,15 +161258,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -100712,8 +161279,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 998 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100721,18 +161288,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -100740,7 +161307,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100748,14 +161315,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -100768,29 +161335,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100801,27 +161369,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100831,8 +161400,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -100858,15 +161428,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -100877,8 +161449,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 999 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100886,26 +161458,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100913,7 +161487,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100921,15 +161495,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100937,23 +161511,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -100966,25 +161541,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100994,6 +161572,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101021,15 +161600,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -101040,8 +161621,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101049,28 +161630,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101085,16 +161666,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101102,23 +161683,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -101132,24 +161714,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101159,6 +161746,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101186,15 +161774,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -101205,8 +161795,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101214,12 +161804,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -101227,15 +161817,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101250,42 +161838,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101297,26 +161886,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101326,6 +161916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101353,15 +161944,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -101372,8 +161965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101381,26 +161974,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101416,41 +162011,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101469,19 +162065,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101491,6 +162090,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101518,15 +162118,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -101537,8 +162139,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101546,7 +162148,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101557,7 +162159,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -101565,7 +162167,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101573,23 +162175,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101597,25 +162199,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 4 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101626,7 +162229,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -101634,17 +162237,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101654,6 +162262,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101681,15 +162290,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -101700,8 +162311,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101709,14 +162320,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -101724,13 +162335,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101745,42 +162354,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101792,24 +162402,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101819,6 +162434,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101846,15 +162462,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -101865,8 +162483,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101874,28 +162492,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101911,15 +162527,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101927,23 +162543,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -101957,26 +162574,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101986,6 +162606,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102013,15 +162634,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -102032,35 +162655,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102076,15 +162699,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102092,23 +162715,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -102122,26 +162746,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102151,6 +162778,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102178,15 +162806,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -102197,20 +162827,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -102218,14 +162848,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102240,42 +162870,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102287,24 +162918,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102314,6 +162950,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102341,15 +162978,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -102360,37 +162999,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102398,7 +163035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102406,41 +163043,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102451,27 +163089,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102481,6 +163122,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102508,15 +163150,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -102527,35 +163171,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102563,15 +163207,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -102583,58 +163227,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102644,6 +163294,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102671,15 +163322,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -102690,37 +163343,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102728,7 +163379,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102736,65 +163387,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -102802,6 +163456,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102811,6 +163466,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102838,15 +163494,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -102857,35 +163515,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102893,78 +163551,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102974,6 +163638,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -103001,15 +163666,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103020,37 +163687,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103058,15 +163723,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -103082,50 +163747,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 32 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103135,7 +163810,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103162,15 +163838,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103181,37 +163859,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103219,23 +163895,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -103243,52 +163919,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103298,7 +163982,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103325,15 +164010,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103344,35 +164031,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103380,76 +164067,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103459,8 +164154,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103486,15 +164182,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103505,35 +164203,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103541,23 +164239,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -103565,52 +164263,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103620,7 +164326,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103647,15 +164354,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103666,35 +164375,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103702,23 +164411,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -103726,50 +164435,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103779,7 +164498,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103806,15 +164526,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103825,37 +164547,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103863,23 +164583,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -103887,52 +164607,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103942,7 +164670,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103969,15 +164698,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -103988,35 +164719,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104024,16 +164755,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104048,50 +164779,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104101,7 +164842,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104128,15 +164870,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -104147,37 +164891,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104185,16 +164927,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104209,52 +164951,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104264,7 +165014,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104291,15 +165042,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -104310,35 +165063,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104346,74 +165099,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104423,7 +165186,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104450,15 +165214,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -104469,37 +165235,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104507,76 +165271,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104586,7 +165356,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104613,15 +165384,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -104632,35 +165405,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104668,74 +165443,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104745,7 +165530,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104772,15 +165558,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -104791,15 +165579,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -104811,17 +165599,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104829,16 +165615,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104846,59 +165632,67 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104908,7 +165702,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104935,15 +165730,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -104954,35 +165751,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104990,16 +165787,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105007,57 +165804,65 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105067,7 +165872,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105094,15 +165900,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -105113,37 +165921,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105151,15 +165959,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -105168,59 +165976,67 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 32 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105230,7 +166046,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105257,15 +166074,17 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true @@ -105276,35 +166095,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105312,16 +166131,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105332,56 +166151,64 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105391,7 +166218,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105418,27 +166246,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105447,25 +166279,25 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105473,16 +166305,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -105497,50 +166329,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 864 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105550,8 +166392,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -105577,34 +166420,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -105617,16 +166464,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105640,9 +166485,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -105658,52 +166503,60 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105713,7 +166566,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105740,34 +166594,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -105780,14 +166638,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105801,39 +166659,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105845,24 +166708,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105872,7 +166738,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105899,47 +166766,51 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -105948,7 +166819,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105956,13 +166827,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105976,25 +166847,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106005,7 +166881,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -106013,17 +166889,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106033,7 +166912,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106060,56 +166940,60 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106123,8 +167007,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106137,25 +167021,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106168,23 +167057,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106194,7 +167088,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106221,52 +167116,54 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106284,8 +167181,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106298,7 +167195,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -106313,15 +167210,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3200 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106333,24 +167226,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106361,7 +167259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106388,27 +167286,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106417,23 +167319,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 32 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106452,20 +167352,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -106473,22 +167373,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106501,25 +167401,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106557,27 +167458,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106586,21 +167491,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106619,7 +167526,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106647,15 +167554,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106668,25 +167575,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106698,7 +167606,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -106724,27 +167632,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106753,11 +167665,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -106765,9 +167677,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106786,20 +167700,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -106807,33 +167721,33 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -106841,17 +167755,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106889,27 +167808,31 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106928,13 +167851,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106953,16 +167874,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -106974,51 +167895,56 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107056,34 +167982,38 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: true TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -107096,12 +168026,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -109357,24 +170285,10 @@ - [95, 6513.35] - - [4288, 1024, 1, 128] - [80, 4291.67] - - - [512, 2048, 1, 49] - - [126, 4554.98] - - - [512, 128, 1, 784] - - [119, 3195.29] - - - [2048, 512, 1, 49] - - [127, 4253.33] - - - [1024, 256, 1, 196] - - [123, 4039.33] - - [256, 64, 1, 3136] - [121, 3015.27] - - [256, 1024, 1, 196] - [125, 4225.35] - - - [64, 256, 1, 3136] - - [122, 3058.35] - - - [128, 512, 1, 784] - - [120, 3380.28] - - - [64, 64, 1, 3136] - - [124, 1372.34] - - [1024, 1024, 1, 3328] - [237, 8705.0] - - [2048, 200, 1, 3200] @@ -110025,8 +170939,6 @@ - [231, 6307.6] - - [1024, 512, 1, 4608] - [242, 7953.38] - - - [2048, 256, 1, 768] - - [242, 7059.14] - - [4096, 200, 1, 32] - [191, 2199.19] - - [4096, 200, 1, 3328] @@ -110209,8 +171121,6 @@ - [231, 5745.62] - - [1024, 200, 1, 1280] - [223, 4446.13] - - - [4096, 512, 1, 4096] - - [141, 9264.39] - - [2048, 256, 1, 3200] - [231, 7842.75] - - [2048, 512, 1, 15360] @@ -110711,64 +171621,28 @@ - [237, 6628.17] - - [4096, 1024, 1, 6144] - [139, 9592.98] - - - [1280, 384, 1, 64] - - [270, 3196.88] - - [256, 64, 1, 1225] - [271, 1194.67] - - [2048, 320, 1, 64] - [273, 3449.26] - - - [256, 48, 1, 1225] - - [264, 913.398] - - - [2048, 192, 1, 64] - - [263, 2516.58] - - [1024, 128, 1, 289] - [277, 2869.68] - - - [1280, 192, 1, 64] - - [256, 1872.46] - - - [192, 32, 1, 1225] - - [261, 505.806] - - - [1280, 448, 1, 64] - - [257, 3078.87] - - [384, 64, 1, 1225] - [262, 1511.33] - - [2048, 384, 1, 64] - [275, 3836.25] - - - [288, 48, 1, 1225] - - [258, 1032.59] - - [64, 80, 1, 5329] - [274, 888.167] - - [1024, 384, 1, 289] - [268, 4291.52] - - [2048, 448, 1, 64] - [267, 3783.52] - - - [1280, 320, 1, 64] - - [273, 2776.95] - - - [192, 64, 1, 1225] - - [258, 926.897] - - - [384, 192, 1, 1225] - - [269, 2560.0] - - - [1536, 256, 1, 64] - - [276, 2621.44] - - - [192, 48, 1, 1225] - - [261, 698.614] - - - [768, 128, 1, 289] - - [278, 2291.12] - - - [1024, 256, 1, 289] - - [276, 4064.36] - - [768, 192, 1, 289] - [272, 2690.33] - - - [1536, 384, 1, 64] - - [259, 3145.73] - - [288, 64, 1, 1225] - [261, 1142.67] - - - [1024, 192, 1, 289] - - [266, 3243.13] - - [384, 96, 1, 1225] - [279, 1844.71] - - - [160, 64, 1, 5329] - - [265, 1564.48] - - - [768, 160, 1, 289] - - [260, 2386.58] - - [1024, 3392, 1, 4096] - [305, 8502.92] - - [1024, 3301, 1, 4096] @@ -112697,5598 +173571,6596 @@ - [336, 6145.5] - - [1024, 3712, 1, 1024] - [338, 8933.88] + - - [256, 256, 192, 64] + - [343, 8264.64] + - - [768, 4096, 1, 768] + - [356, 9642.08] + - - [768, 64, 1, 768] + - [353, 1850.43] + - - [768, 1280, 1, 768] + - [356, 8738.13] + - - [30522, 320, 1, 768] + - [357, 9733.59] + - - [128, 128, 96, 64] + - [346, 5470.83] + - - [2, 16, 1, 768] + - [349, 2.47742] + - - [30522, 1280, 1, 768] + - [355, 10127.9] + - - [30522, 640, 1, 768] + - [356, 9987.61] + - - [2, 8, 1, 768] + - [348, 0.96] + - - [768, 4096, 1, 3072] + - [358, 9479.41] + - - [768, 32, 1, 768] + - [352, 880.334] + - - [2, 64, 1, 768] + - [349, 9.99024] + - - [256, 256, 96, 64] + - [343, 7614.47] + - - [64, 64, 768, 64] + - [345, 5354.43] + - - [30522, 160, 1, 768] + - [354, 7740.11] + - - [768, 320, 1, 768] + - [347, 5423.67] + - - [128, 128, 384, 64] + - [344, 7179.98] + - - [768, 16, 1, 768] + - [350, 706.376] + - - [3072, 4096, 1, 768] + - [359, 9961.74] + - - [2048, 512, 1, 100] + - [361, 5180.71] + - - [1024, 200, 1, 560] + - [362, 4061.19] + - - [256, 1280, 1, 1024] + - [369, 4337.44] + - - [256, 44505, 1, 1024] + - [405, 8597.69] + - - [10240, 8976, 1, 256] + - [408, 9471.43] + - - [256, 7168, 1, 1024] + - [399, 6718.56] + - - [8448, 8976, 1, 256] + - [391, 9601.31] + - - [18944, 8976, 1, 256] + - [400, 9666.26] + - - [256, 19200, 1, 1024] + - [376, 7488.94] + - - [5632, 8976, 1, 256] + - [388, 9358.39] + - - [256, 23552, 1, 1024] + - [403, 7980.89] + - - [256, 6656, 1, 1024] + - [403, 6287.22] + - - [256, 14336, 1, 1024] + - [398, 7049.26] + - - [256, 12544, 1, 1024] + - [376, 6728.47] + - - [2048, 684, 1, 768] + - [393, 8479.18] + - - [5376, 8976, 1, 256] + - [388, 9519.51] + - - [256, 5888, 1, 1024] + - [408, 6012.4] + - - [19968, 8976, 1, 256] + - [400, 9684.67] + - - [3840, 8976, 1, 256] + - [385, 9461.89] + - - [4608, 8976, 1, 256] + - [385, 9305.82] + - - [256, 684, 1, 1024] + - [411, 3513.06] + - - [256, 22016, 1, 1024] + - [376, 7643.79] + - - [256, 23296, 1, 1024] + - [405, 8048.12] + - - [4864, 8976, 1, 256] + - [383, 9545.62] + - - [256, 7424, 1, 1024] + - [401, 6770.65] + - - [18176, 8976, 1, 256] + - [408, 9729.47] + - - [256, 15104, 1, 1024] + - [397, 7289.08] + - - [8192, 8976, 1, 256] + - [400, 9395.49] + - - [256, 16128, 1, 1024] + - [400, 7461.28] + - - [13312, 8976, 1, 256] + - [408, 9550.97] + - - [256, 21504, 1, 1024] + - [405, 7635.93] + - - [6400, 8976, 1, 256] + - [392, 9560.96] + - - [256, 8960, 1, 1024] + - [367, 6292.36] + - - [1792, 8976, 1, 256] + - [382, 9372.18] + - - [13824, 8976, 1, 256] + - [400, 9585.27] + - - [11776, 8976, 1, 256] + - [400, 9560.34] + - - [256, 20992, 1, 1024] + - [398, 7490.65] + - - [20480, 8976, 1, 256] + - [408, 9610.7] + - - [5888, 8976, 1, 256] + - [379, 9565.2] + - - [256, 10496, 1, 1024] + - [370, 6631.96] + - - [21248, 8976, 1, 256] + - [400, 9755.77] + - - [5120, 8976, 1, 256] + - [408, 9244.59] + - - [7168, 8976, 1, 256] + - [400, 9388.42] + - - [2048, 1536, 1, 768] + - [389, 9446.04] + - - [256, 8192, 1, 1024] + - [394, 6948.89] + - - [4096, 8976, 1, 256] + - [399, 9115.94] + - - [3328, 8976, 1, 256] + - [392, 9434.55] + - - [1280, 8976, 1, 256] + - [390, 9129.8] + - - [2560, 8976, 1, 256] + - [387, 9199.48] + - - [3072, 8976, 1, 256] + - [402, 8963.6] + - - [256, 11776, 1, 1024] + - [380, 6869.8] + - - [18688, 8976, 1, 256] + - [408, 9726.21] + - - [15104, 8976, 1, 256] + - [408, 9715.71] + - - [23552, 8976, 1, 256] + - [400, 9648.42] + - - [6144, 8976, 1, 256] + - [408, 9339.8] + - - [12544, 8976, 1, 256] + - [408, 9654.45] + - - [256, 11264, 1, 1024] + - [381, 6814.98] + - - [2048, 114, 1, 512] + - [412, 4583.5] + - - [4352, 8976, 1, 256] + - [392, 9471.4] + - - [15360, 8976, 1, 256] + - [408, 9583.77] + - - [256, 31488, 1, 1024] + - [407, 8438.01] + - - [28672, 8976, 1, 256] + - [400, 9688.85] + - - [256, 18176, 1, 1024] + - [376, 7405.09] + - - [9728, 8976, 1, 256] + - [408, 9524.15] + - - [256, 2816, 1, 1024] + - [372, 5405.66] + - - [256, 18944, 1, 1024] + - [376, 7503.41] + - - [256, 3584, 1, 1024] + - [375, 6107.15] + - - [7936, 8976, 1, 256] + - [388, 9608.31] + - - [19712, 8976, 1, 256] + - [408, 9736.25] + - - [256, 14848, 1, 1024] + - [381, 7163.42] + - - [256, 8448, 1, 1024] + - [381, 6372.56] + - - [256, 6400, 1, 1024] + - [395, 6395.71] + - - [256, 6144, 1, 1024] + - [406, 6490.22] + - - [9472, 8976, 1, 256] + - [385, 9609.92] + - - [256, 9984, 1, 1024] + - [368, 6484.75] + - - [684, 8976, 1, 256] + - [377, 8128.53] + - - [20992, 8976, 1, 256] + - [400, 9689.65] + - - [2048, 684, 1, 512] + - [384, 7241.78] + - - [2048, 114, 1, 768] + - [410, 4872.46] + - - [8960, 8976, 1, 256] + - [383, 9603.35] + - - [2048, 1536, 1, 512] + - [386, 8830.11] + - - [256, 3328, 1, 1024] + - [374, 5612.55] + - - [33536, 8976, 1, 256] + - [400, 9797.71] + - - [2048, 8976, 1, 256] + - [400, 8975.46] + - - [10496, 8976, 1, 256] + - [391, 9654.43] + - - [256, 5376, 1, 1024] + - [409, 5626.34] + - - [256, 21248, 1, 1024] + - [378, 7525.45] + - - [256, 13312, 1, 1024] + - [376, 6767.11] + - - [16128, 8976, 1, 256] + - [400, 9715.57] + - - [2304, 8976, 1, 256] + - [373, 9433.83] + - - [256, 4864, 1, 1024] + - [363, 5743.55] + - - [17152, 8976, 1, 256] + - [408, 9708.94] + - - [15872, 8976, 1, 256] + - [408, 9657.57] + - - [9984, 8976, 1, 256] + - [385, 9639.74] + - - [256, 14592, 1, 1024] + - [397, 7223.92] + - - [256, 33536, 1, 1024] + - [404, 8147.31] + - - [11264, 8976, 1, 256] + - [400, 9509.96] + - - [31488, 8976, 1, 256] + - [408, 9799.31] + - - [256, 20480, 1, 1024] + - [381, 7498.2] + - - [44505, 8976, 1, 256] + - [392, 9804.78] + - - [13568, 8976, 1, 256] + - [400, 9680.24] + - - [256, 11520, 1, 1024] + - [380, 6805.26] + - - [256, 7936, 1, 1024] + - [396, 6971.77] + - - [2048, 256, 1, 768] + - [366, 7129.13] + - - [256, 4608, 1, 1024] + - [364, 5462.91] + - - [256, 2304, 1, 1024] + - [371, 4842.69] + - - [256, 2560, 1, 1024] + - [372, 5309.25] + - - [2816, 8976, 1, 256] + - [383, 9409.56] + - - [1728, 320, 1, 64] + - [419, 3205.57] + - - [1152, 128, 1, 784] + - [466, 3498.96] + - - [576, 96, 1, 5329] + - [452, 3947.92] + - - [864, 96, 1, 1225] + - [473, 3009.67] + - - [256, 128, 1, 784] + - [463, 1536.49] + - - [1440, 320, 1, 196] + - [416, 4824.62] + - - [192, 48, 1, 1225] + - [494, 820.465] + - - [2592, 384, 1, 289] + - [434, 7353.01] + - - [192, 80, 36, 10368] + - [484, 5360.04] + - - [896, 192, 1, 289] + - [451, 3076.56] + - - [768, 128, 1, 289] + - [476, 2351.81] + - - [64, 256, 1, 3136] + - [502, 1809.16] + - - [1280, 384, 1, 64] + - [416, 3171.1] + - - [512, 144, 1, 196] + - [474, 1445.07] + - - [1344, 192, 1, 289] + - [457, 4376.52] + - - [288, 64, 1, 21609] + - [468, 3396.12] + - - [400, 32, 1, 784] + - [495, 922.353] + - - [288, 32, 1, 21609] + - [506, 2816.01] + - - [1280, 448, 1, 64] + - [419, 3253.56] + - - [3456, 256, 1, 169] + - [431, 5822.44] + - - [2304, 256, 1, 196] + - [429, 4931.98] + - - [384, 192, 1, 1225] + - [477, 2720.39] + - - [832, 48, 1, 49] + - [472, 344.518] + - - [832, 192, 1, 49] + - [454, 1099.36] + - - [1280, 192, 1, 64] + - [455, 2069.56] + - - [192, 32, 1, 784] + - [494, 459.627] + - - [288, 48, 1, 1225] + - [501, 1176.0] + - - [512, 112, 1, 196] + - [469, 1277.21] + - - [224, 192, 36, 2592] + - [486, 7369.56] + - - [528, 32, 1, 196] + - [460, 440.374] + - - [192, 128, 36, 1568] + - [485, 8245.76] + - - [4032, 384, 1, 64] + - [430, 5898.24] + - - [576, 64, 1, 3136] + - [475, 2671.11] + - - [2048, 32, 1, 1001] + - [477, 2323.0] + - - [480, 64, 1, 196] + - [462, 752.64] + - - [512, 256, 1, 196] + - [464, 2528.55] + - - [864, 96, 1, 289] + - [474, 1958.4] + - - [896, 128, 1, 289] + - [477, 2725.73] + - - [192, 64, 1, 784] + - [492, 898.675] + - - [1200, 64, 1, 1225] + - [476, 2780.14] + - - [1296, 288, 1, 196] + - [415, 3826.18] + - - [576, 96, 1, 5041] + - [456, 3795.58] + - - [1024, 256, 1, 289] + - [445, 4488.13] + - - [1024, 2048, 1, 49] + - [435, 5077.1] + - - [192, 64, 36, 6272] + - [479, 7514.98] + - - [4096, 512, 1, 4096] + - [441, 10276.0] + - - [192, 32, 1, 1225] + - [495, 556.686] + - - [1024, 256, 1, 196] + - [455, 3892.44] + - - [1120, 192, 1, 289] + - [444, 3752.81] + - - [400, 48, 1, 196] + - [469, 480.0] + - - [1728, 224, 1, 1225] + - [422, 5575.77] + - - [800, 96, 1, 784] + - [476, 2668.94] + - - [1152, 384, 1, 64] + - [426, 3077.34] + - - [4608, 512, 1, 49] + - [433, 4676.6] + - - [1792, 256, 1, 289] + - [426, 5345.94] + - - [864, 128, 1, 784] + - [476, 3816.2] + - - [1728, 384, 1, 169] + - [428, 5191.68] + - - [480, 16, 1, 196] + - [497, 241.231] + - - [1568, 256, 1, 289] + - [416, 4723.41] + - - [1152, 448, 1, 64] + - [422, 3356.72] + - - [512, 64, 1, 196] + - [461, 802.816] + - - [1344, 224, 1, 289] + - [416, 3519.63] + - - [9216, 512, 1, 4096] + - [439, 9146.02] + - - [27, 32, 1, 22201] + - [507, 264.356] + - - [1152, 192, 1, 784] + - [446, 4904.08] + - - [1536, 256, 1, 64] + - [414, 2578.47] + - - [800, 128, 1, 196] + - [476, 1991.11] + - - [800, 64, 1, 196] + - [471, 1150.83] + - - [864, 208, 1, 196] + - [448, 2684.72] + - - [1440, 320, 1, 49] + - [417, 2313.44] + - - [512, 128, 1, 784] + - [467, 2780.32] + - - [720, 192, 1, 5041] + - [442, 5410.46] + - - [256, 64, 1, 784] + - [499, 1163.5] + - - [256, 48, 1, 1225] + - [494, 1075.2] + - - [576, 192, 1, 3136] + - [442, 4833.01] + - - [160, 64, 1, 5329] + - [496, 1753.5] + - - [3456, 384, 1, 289] + - [436, 7341.75] + - - [32, 32, 36, 43808] + - [490, 1378.03] + - - [1344, 512, 1, 64] + - [415, 3822.93] + - - [192, 16, 1, 784] + - [495, 228.073] + - - [3456, 384, 1, 169] + - [432, 6675.02] + - - [1152, 256, 1, 196] + - [425, 3211.26] + - - [1728, 192, 1, 1225] + - [426, 4852.26] + - - [2048, 512, 1, 49] + - [438, 3471.64] + - - [576, 96, 1, 1225] + - [469, 2176.66] + - - [512, 2048, 1, 49] + - [420, 3845.83] + - - [1728, 192, 1, 64] + - [415, 2369.83] + - - [832, 256, 1, 49] + - [445, 1433.6] + - - [512, 128, 1, 196] + - [470, 1459.67] + - - [1200, 128, 1, 49] + - [465, 1069.09] + - - [528, 256, 1, 196] + - [453, 2069.76] + - - [256, 512, 1, 784] + - [476, 4538.89] + - - [480, 192, 1, 196] + - [476, 1792.0] + - - [96, 64, 36, 2592] + - [483, 4845.41] + - - [96, 96, 36, 2592] + - [488, 5111.53] + - - [1024, 192, 1, 289] + - [450, 3431.14] + - - [1536, 384, 1, 64] + - [421, 3166.84] + - - [192, 96, 1, 784] + - [461, 881.14] + - - [2048, 192, 1, 64] + - [418, 2330.17] + - - [192, 64, 1, 1225] + - [500, 1100.35] + - - [512, 32, 1, 196] + - [491, 477.867] + - - [128, 96, 36, 1568] + - [487, 6649.09] + - - [528, 128, 1, 196] + - [473, 1403.23] + - - [128, 512, 1, 784] + - [463, 2237.81] + - - [128, 128, 36, 3136] + - [480, 6538.77] + - - [528, 160, 1, 196] + - [477, 1642.67] + - - [448, 64, 1, 5329] + - [452, 3264.81] + - - [1280, 320, 1, 64] + - [416, 2776.95] + - - [1792, 320, 1, 289] + - [428, 5204.9] + - - [2880, 320, 1, 64] + - [424, 4336.94] + - - [147, 64, 1, 12544] + - [505, 2430.27] + - - [4096, 512, 1, 1001] + - [440, 9618.99] + - - [1536, 32, 1, 1001] + - [477, 1757.18] + - - [512, 160, 1, 196] + - [473, 1592.89] + - - [768, 160, 1, 289] + - [474, 2757.17] + - - [1728, 384, 1, 49] + - [426, 3102.49] + - - [64, 32, 36, 43808] + - [481, 2626.43] + - - [64, 64, 1, 3136] + - [493, 610.506] + - - [256, 32, 1, 784] + - [494, 612.837] + - - [480, 96, 1, 196] + - [469, 1055.1] + - - [1024, 32, 1, 1001] + - [459, 1188.43] + - - [832, 160, 1, 49] + - [474, 959.247] + - - [512, 1024, 1, 196] + - [417, 4978.7] + - - [96, 64, 36, 10368] + - [511, 5000.95] + - - [384, 448, 36, 512] + - [516, 8903.0] + - - [2048, 64, 1, 1001] + - [509, 4385.13] + - - [224, 192, 36, 5184] + - [515, 7487.81] + - - [2048, 128, 1, 1001] + - [508, 5764.63] + - - [96, 96, 36, 10368] + - [517, 5275.21] + - - [192, 80, 36, 20736] + - [513, 5409.4] + - - [96, 64, 36, 5184] + - [511, 4911.83] + - - [1536, 64, 1, 1001] + - [510, 3162.03] + - - [96, 64, 36, 20736] + - [512, 5034.33] + - - [384, 448, 36, 256] + - [514, 8815.87] + - - [96, 96, 36, 5184] + - [518, 5236.02] - - [1024, 128, 1, 128] - - [353, 896.319] + - [531, 896.319] - - [4, 704, 1, 1280] - - [390, 328.976] + - [568, 328.976] - - [4, 1856, 1, 3328] - - [400, 501.461] + - [578, 501.461] - - [1856, 448, 1, 3328] - - [445, 5678.01] + - [623, 5678.01] - - [2944, 4288, 1, 1280] - - [431, 8412.49] + - [609, 8412.49] - - [2368, 64, 1, 3328] - - [381, 4914.02] + - [559, 4914.02] - - [1760, 32, 1, 1760] - - [408, 3313.04] + - [586, 3313.04] - - [2368, 5888, 1, 256] - - [431, 6489.82] + - [609, 6489.82] - - [5888, 1856, 1, 256] - - [443, 7791.98] + - [621, 7791.98] - - [128, 64, 1, 256] - - [415, 369.317] + - [593, 369.317] - - [512, 24000, 1, 1536] - - [437, 8827.47] + - [615, 8827.47] - - [128, 6784, 1, 3328] - - [437, 6537.09] + - [615, 6537.09] - - [5888, 1408, 1, 256] - - [451, 6129.71] + - [629, 6129.71] - - [5888, 1856, 1, 3328] - - [437, 7969.27] + - [615, 7969.27] - - [5056, 704, 1, 256] - - [437, 6723.92] + - [615, 6723.92] - - [2048, 400, 1, 512] - - [443, 4531.54] + - [621, 4531.54] - - [5888, 2944, 1, 3328] - - [443, 8608.14] + - [621, 8608.14] - - [1856, 4288, 1, 256] - - [443, 6297.64] + - [621, 6297.64] - - [1024, 5056, 1, 128] - - [421, 3595.47] + - [599, 3595.47] - - [5056, 5056, 1, 3328] - - [437, 8559.26] + - [615, 8559.26] - - [1408, 5888, 1, 1280] - - [432, 6797.16] + - [610, 6797.16] - - [2368, 448, 1, 128] - - [421, 2815.0] + - [599, 2815.0] - - [2368, 6784, 1, 128] - - [425, 4782.08] + - [603, 4782.08] - - [1024, 3584, 1, 3328] - - [433, 8402.54] + - [611, 8402.54] - - [512, 48000, 1, 2048] - - [437, 8162.33] + - [615, 8162.33] - - [128, 448, 1, 1280] - - [408, 2903.59] + - [586, 2903.59] - - [256, 4288, 1, 3328] - - [438, 6346.04] + - [616, 6346.04] - - [5888, 1408, 1, 1280] - - [437, 8959.55] + - [615, 8959.55] - - [704, 1856, 1, 3328] - - [432, 6955.37] + - [610, 6955.37] - - [4, 1408, 1, 128] - - [452, 60.1747] + - [630, 60.1747] - - [1024, 2368, 1, 256] - - [439, 5927.88] + - [617, 5927.88] - - [64, 4, 1, 256] - - [457, 13.3129] + - [635, 13.3129] - - [1408, 1856, 1, 1280] - - [435, 8051.68] + - [613, 8051.68] - - [1408, 64, 1, 1280] - - [411, 3400.55] + - [589, 3400.55] - - [448, 1024, 1, 1280] - - [439, 5730.02] + - [617, 5730.02] - - [6144, 24000, 1, 2048] - - [443, 7738.4] + - [621, 7738.4] - - [4096, 32, 1, 4096] - - [381, 2381.53] + - [559, 2381.53] - - [256, 1408, 1, 3328] - - [439, 4844.88] + - [617, 4844.88] - - [5056, 5056, 1, 1280] - - [443, 9090.2] + - [621, 9090.2] - - [448, 5056, 1, 256] - - [449, 4961.28] + - [627, 4961.28] - - [704, 1856, 1, 1280] - - [435, 6456.54] + - [613, 6456.54] - - [128, 5056, 1, 128] - - [364, 2251.12] + - [542, 2251.12] - - [2368, 128, 1, 256] - - [432, 3403.37] + - [610, 3403.37] - - [1760, 6400, 1, 1760] - - [431, 8959.8] + - [609, 8959.8] - - [1856, 1408, 1, 128] - - [424, 3493.16] + - [602, 3493.16] - - [64, 5056, 1, 256] - - [433, 2582.32] + - [611, 2582.32] - - [6784, 256, 1, 3328] - - [431, 7323.64] + - [609, 7323.64] - - [6784, 4288, 1, 3328] - - [433, 8542.19] + - [611, 8542.19] - - [4288, 448, 1, 256] - - [449, 5030.6] + - [627, 5030.6] - - [64, 704, 1, 128] - - [366, 375.567] + - [544, 375.567] - - [1856, 2368, 1, 3328] - - [442, 6742.44] + - [620, 6742.44] - - [4288, 2944, 1, 1280] - - [443, 8578.27] + - [621, 8578.27] - - [704, 5056, 1, 1280] - - [439, 8014.55] + - [617, 8014.55] - - [2368, 704, 1, 3328] - - [438, 6544.41] + - [616, 6544.41] - - [256, 5888, 1, 256] - - [436, 5933.0] + - [614, 5933.0] - - [1856, 4288, 1, 3328] - - [442, 7410.82] + - [620, 7410.82] - - [256, 2944, 1, 256] - - [438, 5014.08] + - [616, 5014.08] - - [5888, 1024, 1, 256] - - [443, 8069.44] + - [621, 8069.44] - - [448, 64, 1, 1280] - - [418, 2057.28] + - [596, 2057.28] - - [3072, 64, 1, 1024] - - [398, 2145.52] + - [576, 2145.52] - - [3584, 4, 1, 1280] - - [390, 498.743] + - [568, 498.743] - - [16384, 3200, 1, 4096] - - [430, 6621.53] + - [608, 6621.53] - - [2944, 64, 1, 256] - - [438, 2554.89] + - [616, 2554.89] - - [128, 4, 1, 1280] - - [400, 87.2489] + - [578, 87.2489] - - [1408, 2944, 1, 256] - - [437, 8029.45] + - [615, 8029.45] - - [256, 1856, 1, 1280] - - [432, 6170.7] + - [610, 6170.7] - - [6784, 5056, 1, 3328] - - [441, 7134.29] + - [619, 7134.29] - - [5056, 5056, 1, 256] - - [449, 6246.9] + - [627, 6246.9] - - [1408, 6784, 1, 128] - - [426, 4329.55] + - [604, 4329.55] - - [64, 1024, 1, 1280] - - [408, 3206.75] + - [586, 3206.75] - - [2944, 4, 1, 256] - - [457, 333.58] + - [635, 333.58] - - [704, 5056, 1, 128] - - [421, 4085.52] + - [599, 4085.52] - - [4, 2368, 1, 1280] - - [458, 394.767] + - [636, 394.767] - - [2368, 2944, 1, 1280] - - [437, 8634.05] + - [615, 8634.05] - - [128, 3584, 1, 1280] - - [438, 6046.25] + - [616, 6046.25] - - [6784, 6784, 1, 1280] - - [443, 8847.51] + - [621, 8847.51] - - [1408, 4288, 1, 1280] - - [443, 8236.79] + - [621, 8236.79] - - [3584, 4288, 1, 1280] - - [438, 7399.98] + - [616, 7399.98] - - [2368, 704, 1, 1280] - - [431, 6754.5] + - [609, 6754.5] - - [5056, 4288, 1, 3328] - - [437, 8569.63] + - [615, 8569.63] - - [3584, 2368, 1, 3328] - - [442, 7942.48] + - [620, 7942.48] - - [64, 704, 1, 1280] - - [411, 2363.69] + - [589, 2363.69] - - [4288, 256, 1, 256] - - [439, 4591.9] + - [617, 4591.9] - - [2944, 128, 1, 128] - - [364, 1878.39] + - [542, 1878.39] - - [6144, 32, 1, 2560] - - [409, 3334.2] + - [587, 3334.2] - - [6784, 448, 1, 1280] - - [441, 7939.3] + - [619, 7939.3] - - [1408, 2944, 1, 128] - - [425, 4096.61] + - [603, 4096.61] - - [4288, 2944, 1, 256] - - [431, 8141.23] + - [609, 8141.23] - - [5888, 704, 1, 1280] - - [432, 7516.23] + - [610, 7516.23] - - [5056, 4, 1, 3328] - - [375, 552.509] + - [553, 552.509] - - [1856, 64, 1, 1280] - - [381, 3870.86] + - [559, 3870.86] - - [1760, 16, 1, 1760] - - [393, 2181.51] + - [571, 2181.51] - - [448, 5888, 1, 128] - - [426, 3371.1] + - [604, 3371.1] - - [5888, 64, 1, 3328] - - [406, 5319.48] + - [584, 5319.48] - - [2944, 256, 1, 3328] - - [438, 7122.4] + - [616, 7122.4] - - [1024, 64, 1, 128] - - [353, 595.882] + - [531, 595.882] - - [5056, 2368, 1, 1280] - - [432, 7778.29] + - [610, 7778.29] - - [448, 3584, 1, 1280] - - [437, 6500.62] + - [615, 6500.62] - - [6784, 5888, 1, 256] - - [437, 8918.68] + - [615, 8918.68] - - [704, 1024, 1, 128] - - [421, 2627.51] + - [599, 2627.51] - - [704, 128, 1, 1280] - - [408, 3408.59] + - [586, 3408.59] - - [4, 3584, 1, 128] - - [452, 140.821] + - [630, 140.821] - - [1408, 448, 1, 1280] - - [432, 5881.54] + - [610, 5881.54] - - [1024, 1408, 1, 256] - - [436, 5647.27] + - [614, 5647.27] - - [2368, 2368, 1, 3328] - - [430, 7688.83] + - [608, 7688.83] - - [1856, 6784, 1, 128] - - [421, 4705.95] + - [599, 4705.95] - - [5056, 704, 1, 3328] - - [441, 8198.98] + - [619, 8198.98] - - [1408, 1856, 1, 256] - - [443, 6340.05] + - [621, 6340.05] - - [1408, 704, 1, 3328] - - [435, 7599.65] + - [613, 7599.65] - - [2368, 5056, 1, 256] - - [443, 8242.85] + - [621, 8242.85] - - [1408, 256, 1, 1280] - - [438, 4879.26] + - [616, 4879.26] - - [3072, 128, 1, 1024] - - [407, 2525.52] + - [585, 2525.52] - - [3584, 2368, 1, 1280] - - [439, 8132.72] + - [617, 8132.72] - - [4288, 64, 1, 3328] - - [394, 5156.53] + - [572, 5156.53] - - [2368, 4, 1, 1280] - - [456, 482.75] + - [634, 482.75] - - [704, 5888, 1, 256] - - [446, 5398.75] + - [624, 5398.75] - - [6784, 2944, 1, 128] - - [422, 4748.99] + - [600, 4748.99] - - [2560, 1600, 1, 2560] - - [433, 7355.0] + - [611, 7355.0] - - [4288, 6784, 1, 3328] - - [430, 7409.41] + - [608, 7409.41] - - [2944, 256, 1, 256] - - [438, 5077.42] + - [616, 5077.42] - - [2944, 6784, 1, 3328] - - [443, 8068.05] + - [621, 8068.05] - - [704, 1408, 1, 3328] - - [438, 7239.43] + - [616, 7239.43] - - [6144, 5984, 1, 2048] - - [437, 7176.07] + - [615, 7176.07] - - [3584, 704, 1, 3328] - - [443, 6642.86] + - [621, 6642.86] - - [2944, 256, 1, 128] - - [422, 2644.54] + - [600, 2644.54] - - [6784, 4, 1, 1280] - - [454, 402.487] + - [632, 402.487] - - [1024, 64, 1, 1280] - - [408, 2602.03] + - [586, 2602.03] - - [2048, 1600, 1, 512] - - [435, 5592.5] + - [613, 5592.5] - - [448, 4288, 1, 256] - - [433, 6128.99] + - [611, 6128.99] - - [64, 3584, 1, 3328] - - [374, 5534.93] + - [552, 5534.93] - - [1856, 4288, 1, 128] - - [424, 4400.11] + - [602, 4400.11] - - [704, 2368, 1, 1280] - - [449, 5735.02] + - [627, 5735.02] - - [1856, 2368, 1, 1280] - - [446, 6482.4] + - [624, 6482.4] - - [2368, 128, 1, 3328] - - [419, 4717.32] + - [597, 4717.32] - - [2944, 128, 1, 256] - - [446, 3276.9] + - [624, 3276.9] - - [448, 1408, 1, 256] - - [438, 4852.28] + - [616, 4852.28] - - [1856, 4288, 1, 1280] - - [433, 8132.96] + - [611, 8132.96] - - [64, 5056, 1, 3328] - - [409, 5097.06] + - [587, 5097.06] - - [4, 704, 1, 256] - - [456, 128.831] + - [634, 128.831] - - [1024, 448, 1, 128] - - [421, 1816.94] + - [599, 1816.94] - - [704, 4, 1, 1280] - - [457, 328.976] + - [635, 328.976] - - [704, 256, 1, 128] - - [425, 876.569] + - [603, 876.569] - - [704, 2944, 1, 128] - - [425, 3734.47] + - [603, 3734.47] - - [1408, 1024, 1, 1280] - - [433, 7224.85] + - [611, 7224.85] - - [704, 6784, 1, 256] - - [437, 7354.77] + - [615, 7354.77] - - [6784, 704, 1, 256] - - [433, 6012.28] + - [611, 6012.28] - - [5056, 1408, 1, 128] - - [426, 4311.28] + - [604, 4311.28] - - [2048, 7000, 1, 2048] - - [437, 7232.07] + - [615, 7232.07] - - [256, 3584, 1, 3328] - - [441, 7006.0] + - [619, 7006.0] - - [4, 5888, 1, 3328] - - [459, 534.612] + - [637, 534.612] - - [128, 1408, 1, 128] - - [351, 1177.07] + - [529, 1177.07] - - [3584, 4288, 1, 3328] - - [443, 7135.0] + - [621, 7135.0] - - [5888, 1856, 1, 1280] - - [431, 8395.03] + - [609, 8395.03] - - [256, 1408, 1, 256] - - [432, 3977.46] + - [610, 3977.46] - - [5056, 64, 1, 1280] - - [432, 4257.78] + - [610, 4257.78] - - [1024, 704, 1, 256] - - [432, 5036.93] + - [610, 5036.93] - - [448, 128, 1, 128] - - [353, 533.533] + - [531, 533.533] - - [2368, 3584, 1, 1280] - - [437, 8272.43] + - [615, 8272.43] - - [2368, 6784, 1, 1280] - - [430, 8288.24] + - [608, 8288.24] - - [1856, 4, 1, 1280] - - [370, 464.1] + - [548, 464.1] - - [448, 448, 1, 256] - - [432, 3058.45] + - [610, 3058.45] - - [2944, 3584, 1, 3328] - - [437, 8557.63] + - [615, 8557.63] - - [7680, 32, 1, 2560] - - [409, 3729.03] + - [587, 3729.03] - - [128, 4288, 1, 128] - - [352, 2116.2] + - [530, 2116.2] - - [256, 256, 1, 3328] - - [408, 4051.06] + - [586, 4051.06] - - [128, 1024, 1, 3328] - - [381, 5139.21] + - [559, 5139.21] - - [4, 1408, 1, 3328] - - [400, 502.871] + - [578, 502.871] - - [6784, 2944, 1, 256] - - [431, 8446.06] + - [609, 8446.06] - - [64, 1856, 1, 1280] - - [373, 3870.86] + - [551, 3870.86] - - [6784, 64, 1, 128] - - [421, 1877.62] + - [599, 1877.62] - - [4288, 2368, 1, 3328] - - [441, 8419.4] + - [619, 8419.4] - - [1856, 2368, 1, 256] - - [435, 6887.48] + - [613, 6887.48] - - [3584, 256, 1, 128] - - [425, 2496.71] + - [603, 2496.71] - - [3584, 6784, 1, 3328] - - [437, 7626.18] + - [615, 7626.18] - - [256, 1024, 1, 256] - - [438, 3095.53] + - [616, 3095.53] - - [4, 6784, 1, 3328] - - [400, 589.274] + - [578, 589.274] - - [1024, 5888, 1, 3328] - - [437, 7794.35] + - [615, 7794.35] - - [1024, 128, 1, 1280] - - [410, 3130.18] + - [588, 3130.18] - - [3072, 32, 1, 1024] - - [397, 1675.59] + - [575, 1675.59] - - [6144, 24000, 1, 2560] - - [437, 7256.14] + - [615, 7256.14] - - [5056, 4288, 1, 1280] - - [435, 8349.03] + - [613, 8349.03] - - [5888, 64, 1, 256] - - [384, 2593.35] + - [562, 2593.35] - - [6784, 1856, 1, 3328] - - [431, 8087.38] + - [609, 8087.38] - - [1408, 5056, 1, 1280] - - [433, 7802.63] + - [611, 7802.63] - - [1856, 256, 1, 1280] - - [438, 6150.73] + - [616, 6150.73] - - [64, 5888, 1, 3328] - - [405, 5301.49] + - [583, 5301.49] - - [2368, 2368, 1, 1280] - - [435, 8233.43] + - [613, 8233.43] - - [2944, 5888, 1, 128] - - [428, 3745.51] + - [606, 3745.51] - - [704, 5888, 1, 1280] - - [433, 8245.04] + - [611, 8245.04] - - [2368, 3584, 1, 128] - - [425, 4523.43] + - [603, 4523.43] - - [1856, 5056, 1, 128] - - [422, 4498.08] + - [600, 4498.08] - - [704, 1024, 1, 1280] - - [446, 5479.59] + - [624, 5479.59] - - [448, 256, 1, 3328] - - [389, 5048.8] + - [567, 5048.8] - - [448, 1856, 1, 128] - - [422, 2936.92] + - [600, 2936.92] - - [8192, 3200, 1, 2048] - - [431, 6713.12] + - [609, 6713.12] - - [128, 1024, 1, 128] - - [367, 998.744] + - [545, 998.744] - - [2944, 4, 1, 128] - - [452, 98.7471] + - [630, 98.7471] - - [1024, 704, 1, 1280] - - [438, 5897.0] + - [616, 5897.0] - - [128, 5888, 1, 256] - - [438, 5014.08] + - [616, 5014.08] - - [1024, 5056, 1, 1280] - - [437, 8857.81] + - [615, 8857.81] - - [4288, 1024, 1, 256] - - [443, 6195.39] + - [621, 6195.39] - - [2944, 2368, 1, 128] - - [421, 4442.23] + - [599, 4442.23] - - [704, 704, 1, 3328] - - [438, 6764.4] + - [616, 6764.4] - - [704, 1408, 1, 1280] - - [439, 7383.58] + - [617, 7383.58] - - [5888, 448, 1, 1280] - - [437, 7299.49] + - [615, 7299.49] - - [3584, 256, 1, 3328] - - [435, 7061.72] + - [613, 7061.72] - - [704, 5888, 1, 3328] - - [439, 8142.42] + - [617, 8142.42] - - [704, 1856, 1, 128] - - [425, 3139.14] + - [603, 3139.14] - - [448, 448, 1, 3328] - - [403, 5063.34] + - [581, 5063.34] - - [4, 4288, 1, 128] - - [453, 64.9775] + - [631, 64.9775] - - [128, 704, 1, 1280] - - [373, 3400.55] + - [551, 3400.55] - - [3584, 2944, 1, 256] - - [443, 7982.14] + - [621, 7982.14] - - [3584, 4, 1, 128] - - [452, 105.318] + - [630, 105.318] - - [1856, 128, 1, 3328] - - [404, 5442.19] + - [582, 5442.19] - - [4, 64, 1, 1280] - - [458, 42.3268] + - [636, 42.3268] - - [2944, 448, 1, 128] - - [421, 2926.95] + - [599, 2926.95] - - [128, 2944, 1, 1280] - - [432, 5109.69] + - [610, 5109.69] - - [64, 64, 1, 3328] - - [400, 1252.99] + - [578, 1252.99] - - [448, 2944, 1, 1280] - - [441, 6684.47] + - [619, 6684.47] - - [512, 24000, 1, 2048] - - [437, 7939.03] + - [615, 7939.03] - - [128, 256, 1, 3328] - - [418, 3276.9] + - [596, 3276.9] - - [1408, 5056, 1, 3328] - - [443, 8959.21] + - [621, 8959.21] - - [1856, 1856, 1, 3328] - - [433, 8006.17] + - [611, 8006.17] - - [3584, 128, 1, 256] - - [438, 4292.52] + - [616, 4292.52] - - [2560, 800, 1, 2560] - - [433, 6262.48] + - [611, 6262.48] - - [448, 1408, 1, 3328] - - [449, 4997.35] + - [627, 4997.35] - - [2368, 2368, 1, 256] - - [451, 4978.94] + - [629, 4978.94] - - [4288, 4288, 1, 1280] - - [430, 8617.78] + - [608, 8617.78] - - [64, 448, 1, 1280] - - [376, 2057.28] + - [554, 2057.28] - - [5888, 1024, 1, 1280] - - [448, 6848.17] + - [626, 6848.17] - - [1408, 4288, 1, 256] - - [431, 7077.01] + - [609, 7077.01] - - [448, 4, 1, 256] - - [456, 84.4294] + - [634, 84.4294] - - [5888, 448, 1, 128] - - [425, 3493.91] + - [603, 3493.91] - - [512, 48000, 1, 2560] - - [443, 8960.13] + - [621, 8960.13] - - [35, 8457, 1, 1760] - - [345, 3934.78] + - [523, 3934.78] - - [704, 6784, 1, 3328] - - [430, 8180.88] + - [608, 8180.88] - - [2560, 6400, 1, 2560] - - [431, 7822.24] + - [609, 7822.24] - - [5056, 1024, 1, 1280] - - [433, 8357.38] + - [611, 8357.38] - - [448, 5888, 1, 3328] - - [437, 7505.28] + - [615, 7505.28] - - [128, 4, 1, 128] - - [452, 0.662251] + - [630, 0.662251] - - [1024, 2944, 1, 1280] - - [437, 8406.24] + - [615, 8406.24] - - [5056, 5888, 1, 1280] - - [437, 8819.76] + - [615, 8819.76] - - [4288, 5888, 1, 128] - - [422, 3522.32] + - [600, 3522.32] - - [256, 3584, 1, 256] - - [433, 5883.89] + - [611, 5883.89] - - [1408, 3584, 1, 128] - - [421, 4283.41] + - [599, 4283.41] - - [256, 2944, 1, 3328] - - [441, 5670.63] + - [619, 5670.63] - - [448, 3584, 1, 128] - - [425, 3171.72] + - [603, 3171.72] - - [5888, 2944, 1, 1280] - - [443, 8198.86] + - [621, 8198.86] - - [4, 6784, 1, 1280] - - [390, 553.896] + - [568, 553.896] - - [2368, 5888, 1, 128] - - [421, 4787.32] + - [599, 4787.32] - - [8448, 16, 1, 2816] - - [380, 2452.63] + - [558, 2452.63] - - [64, 2944, 1, 128] - - [353, 1376.66] + - [531, 1376.66] - - [2368, 4, 1, 256] - - [375, 278.177] + - [553, 278.177] - - [3584, 5888, 1, 256] - - [451, 6233.66] + - [629, 6233.66] - - [2368, 1024, 1, 128] - - [422, 3781.51] + - [600, 3781.51] - - [2368, 704, 1, 128] - - [422, 3198.32] + - [600, 3198.32] - - [3584, 2944, 1, 1280] - - [433, 8045.68] + - [611, 8045.68] - - [3584, 2368, 1, 128] - - [422, 4188.57] + - [600, 4188.57] - - [5056, 704, 1, 128] - - [425, 4019.21] + - [603, 4019.21] - - [448, 2368, 1, 128] - - [427, 2522.21] + - [605, 2522.21] - - [5056, 1408, 1, 3328] - - [435, 8349.93] + - [613, 8349.93] - - [1408, 704, 1, 256] - - [441, 4741.42] + - [619, 4741.42] - - [6784, 1024, 1, 3328] - - [443, 8769.5] + - [621, 8769.5] - - [6784, 2944, 1, 3328] - - [440, 7319.74] + - [618, 7319.74] - - [2944, 5056, 1, 3328] - - [430, 8889.76] + - [608, 8889.76] - - [1856, 1856, 1, 256] - - [433, 6309.84] + - [611, 6309.84] - - [1024, 5888, 1, 128] - - [424, 3759.6] + - [602, 3759.6] - - [6784, 2368, 1, 1280] - - [433, 8298.4] + - [611, 8298.4] - - [256, 4, 1, 128] - - [452, 7.10171] + - [630, 7.10171] - - [4288, 5888, 1, 1280] - - [437, 8365.28] + - [615, 8365.28] - - [4288, 4288, 1, 256] - - [437, 6513.78] + - [615, 6513.78] - - [8448, 32, 1, 2816] - - [408, 4257.74] + - [586, 4257.74] - - [448, 2944, 1, 3328] - - [441, 6875.62] + - [619, 6875.62] - - [5888, 4, 1, 128] - - [452, 163.94] + - [630, 163.94] - - [4288, 1856, 1, 1280] - - [437, 8402.91] + - [615, 8402.91] - - [1856, 2944, 1, 3328] - - [437, 6612.21] + - [615, 6612.21] - - [256, 6784, 1, 3328] - - [438, 7358.7] + - [616, 7358.7] - - [64, 5888, 1, 256] - - [432, 3359.05] + - [610, 3359.05] - - [256, 5056, 1, 128] - - [425, 2489.21] + - [603, 2489.21] - - [5056, 1024, 1, 256] - - [443, 8077.87] + - [621, 8077.87] - - [704, 64, 1, 3328] - - [387, 3288.4] + - [565, 3288.4] - - [5056, 1856, 1, 3328] - - [441, 8171.13] + - [619, 8171.13] - - [4, 2944, 1, 3328] - - [400, 546.843] + - [578, 546.843] - - [4, 5056, 1, 256] - - [375, 378.561] + - [553, 378.561] - - [1856, 1408, 1, 256] - - [443, 6320.88] + - [621, 6320.88] - - [8448, 12000, 1, 2816] - - [441, 7365.87] + - [619, 7365.87] - - [6784, 128, 1, 3328] - - [438, 6366.57] + - [616, 6366.57] - - [4288, 1408, 1, 128] - - [421, 4451.7] + - [599, 4451.7] - - [1856, 5888, 1, 3328] - - [439, 8619.76] + - [617, 8619.76] - - [4288, 5056, 1, 256] - - [443, 7289.05] + - [621, 7289.05] - - [1408, 128, 1, 1280] - - [381, 4291.15] + - [559, 4291.15] - - [4096, 800, 1, 1024] - - [432, 5867.89] + - [610, 5867.89] - - [5056, 256, 1, 3328] - - [438, 7527.61] + - [616, 7527.61] - - [704, 704, 1, 256] - - [438, 4417.85] + - [616, 4417.85] - - [1024, 5888, 1, 1280] - - [443, 8674.57] + - [621, 8674.57] - - [6784, 2368, 1, 128] - - [421, 4724.08] + - [599, 4724.08] - - [4, 5056, 1, 1280] - - [390, 540.307] + - [568, 540.307] - - [256, 64, 1, 1280] - - [392, 1515.38] + - [570, 1515.38] - - [128, 1856, 1, 1280] - - [432, 4574.21] + - [610, 4574.21] - - [1856, 1024, 1, 1280] - - [437, 7741.61] + - [615, 7741.61] - - [6784, 4288, 1, 1280] - - [443, 8521.29] + - [621, 8521.29] - - [2560, 64, 1, 2560] - - [374, 3504.7] + - [552, 3504.7] - - [1856, 1856, 1, 1280] - - [433, 7779.31] + - [611, 7779.31] - - [4096, 400, 1, 1024] - - [443, 4157.81] + - [621, 4157.81] - - [3072, 24000, 1, 1024] - - [443, 8663.45] + - [621, 8663.45] - - [128, 4288, 1, 3328] - - [389, 5674.23] + - [567, 5674.23] - - [4, 2368, 1, 3328] - - [400, 525.48] + - [578, 525.48] - - [5888, 1856, 1, 128] - - [425, 4099.74] + - [603, 4099.74] - - [448, 704, 1, 1280] - - [438, 4309.47] + - [616, 4309.47] - - [128, 5056, 1, 1280] - - [381, 5068.46] + - [559, 5068.46] - - [1024, 448, 1, 3328] - - [441, 6077.82] + - [619, 6077.82] - - [1856, 704, 1, 1280] - - [449, 6257.49] + - [627, 6257.49] - - [5056, 3584, 1, 128] - - [422, 4598.52] + - [600, 4598.52] - - [5888, 5888, 1, 3328] - - [443, 8058.25] + - [621, 8058.25] - - [6784, 1024, 1, 256] - - [443, 5120.99] + - [621, 5120.99] - - [2944, 2368, 1, 256] - - [434, 6523.03] + - [612, 6523.03] - - [256, 448, 1, 256] - - [384, 1816.94] + - [562, 1816.94] - - [5056, 5888, 1, 3328] - - [436, 6722.41] + - [614, 6722.41] - - [1856, 1024, 1, 256] - - [443, 6632.31] + - [621, 6632.31] - - [512, 48000, 1, 1536] - - [437, 8556.01] + - [615, 8556.01] - - [3584, 448, 1, 1280] - - [432, 6567.09] + - [610, 6567.09] - - [8448, 5984, 1, 2816] - - [437, 8990.66] + - [615, 8990.66] - - [448, 5888, 1, 256] - - [437, 6220.47] + - [615, 6220.47] - - [704, 64, 1, 128] - - [350, 450.66] + - [528, 450.66] - - [1408, 6784, 1, 3328] - - [430, 8478.68] + - [608, 8478.68] - - [448, 1024, 1, 128] - - [429, 1844.33] + - [607, 1844.33] - - [4288, 704, 1, 128] - - [425, 3895.26] + - [603, 3895.26] - - [128, 1856, 1, 128] - - [356, 1456.46] + - [534, 1456.46] - - [448, 2368, 1, 3328] - - [435, 5538.04] + - [613, 5538.04] - - [5056, 64, 1, 128] - - [421, 1648.94] + - [599, 1648.94] - - [5056, 2944, 1, 256] - - [437, 8230.87] + - [615, 8230.87] - - [6784, 5888, 1, 128] - - [421, 4873.19] + - [599, 4873.19] - - [1024, 700, 1, 512] - - [435, 4445.37] + - [613, 4445.37] - - [704, 1024, 1, 256] - - [433, 4707.99] + - [611, 4707.99] - - [1024, 4, 1, 256] - - [375, 174.863] + - [553, 174.863] - - [2944, 704, 1, 128] - - [425, 3483.42] + - [603, 3483.42] - - [128, 6784, 1, 1280] - - [433, 6522.93] + - [611, 6522.93] - - [1408, 3584, 1, 3328] - - [437, 8673.59] + - [615, 8673.59] - - [2368, 6784, 1, 256] - - [433, 7941.76] + - [611, 7941.76] - - [5056, 1408, 1, 1280] - - [437, 8801.01] + - [615, 8801.01] - - [256, 256, 1, 128] - - [362, 551.982] + - [540, 551.982] - - [5056, 4288, 1, 128] - - [429, 3793.64] + - [607, 3793.64] - - [1408, 1856, 1, 128] - - [421, 3067.74] + - [599, 3067.74] - - [1408, 5888, 1, 3328] - - [437, 9148.97] + - [615, 9148.97] - - [1856, 256, 1, 256] - - [433, 4319.52] + - [611, 4319.52] - - [6784, 6784, 1, 256] - - [433, 7668.53] + - [611, 7668.53] - - [64, 256, 1, 128] - - [367, 131.172] + - [545, 131.172] - - [4288, 2368, 1, 128] - - [422, 4582.99] + - [600, 4582.99] - - [256, 4288, 1, 1280] - - [432, 6058.61] + - [610, 6058.61] - - [2368, 2944, 1, 256] - - [437, 8016.07] + - [615, 8016.07] - - [4, 1856, 1, 256] - - [454, 252.832] + - [632, 252.832] - - [3584, 1856, 1, 1280] - - [433, 7760.24] + - [611, 7760.24] - - [6784, 6784, 1, 128] - - [422, 4970.14] + - [600, 4970.14] - - [256, 1856, 1, 128] - - [428, 1580.59] + - [606, 1580.59] - - [704, 64, 1, 1280] - - [417, 2556.47] + - [595, 2556.47] - - [5888, 5056, 1, 256] - - [437, 8216.67] + - [615, 8216.67] - - [8448, 48000, 1, 2816] - - [443, 4082.89] + - [621, 4082.89] - - [3584, 448, 1, 256] - - [437, 5518.92] + - [615, 5518.92] - - [448, 4288, 1, 128] - - [425, 3415.25] + - [603, 3415.25] - - [7680, 64, 1, 2560] - - [386, 5162.1] + - [564, 5162.1] - - [256, 6784, 1, 256] - - [437, 6272.62] + - [615, 6272.62] - - [1408, 4288, 1, 128] - - [425, 4343.63] + - [603, 4343.63] - - [2944, 704, 1, 3328] - - [432, 7679.71] + - [610, 7679.71] - - [128, 448, 1, 256] - - [372, 1422.59] + - [550, 1422.59] - - [5056, 256, 1, 1280] - - [439, 5052.39] + - [617, 5052.39] - - [2560, 32, 1, 2560] - - [395, 3106.07] + - [573, 3106.07] - - [3584, 3584, 1, 256] - - [443, 8260.57] + - [621, 8260.57] - - [448, 1408, 1, 128] - - [421, 2397.38] + - [599, 2397.38] - - [128, 256, 1, 1280] - - [376, 2340.67] + - [554, 2340.67] - - [3584, 5056, 1, 256] - - [443, 7347.56] + - [621, 7347.56] - - [6784, 128, 1, 256] - - [433, 5591.1] + - [611, 5591.1] - - [4288, 4, 1, 256] - - [375, 354.206] + - [553, 354.206] - - [704, 448, 1, 256] - - [438, 3492.33] + - [616, 3492.33] - - [2944, 2368, 1, 1280] - - [445, 6661.71] + - [623, 6661.71] - - [448, 64, 1, 3328] - - [417, 3058.45] + - [595, 3058.45] - - [1408, 3584, 1, 256] - - [443, 7966.59] + - [621, 7966.59] - - [3584, 4, 1, 3328] - - [456, 605.559] + - [634, 605.559] - - [6784, 3584, 1, 256] - - [433, 7525.41] + - [611, 7525.41] - - [256, 128, 1, 128] - - [365, 276.041] + - [543, 276.041] - - [704, 1408, 1, 128] - - [422, 3109.85] + - [600, 3109.85] - - [4, 2368, 1, 256] - - [456, 283.375] + - [634, 283.375] - - [4288, 128, 1, 1280] - - [438, 5132.65] + - [616, 5132.65] - - [128, 1408, 1, 256] - - [432, 2733.35] + - [610, 2733.35] - - [4, 2944, 1, 256] - - [454, 314.127] + - [632, 314.127] - - [64, 128, 1, 3328] - - [402, 1514.71] + - [580, 1514.71] - - [5056, 2368, 1, 128] - - [426, 3449.17] + - [604, 3449.17] - - [2944, 2944, 1, 3328] - - [430, 8169.03] + - [608, 8169.03] - - [5056, 6784, 1, 256] - - [450, 5792.77] + - [628, 5792.77] - - [1856, 3584, 1, 128] - - [427, 4213.5] + - [605, 4213.5] - - [128, 2944, 1, 128] - - [351, 1970.46] + - [529, 1970.46] - - [35, 8457, 1, 2560] - - [346, 3525.15] + - [524, 3525.15] - - [1024, 704, 1, 3328] - - [432, 6784.99] + - [610, 6784.99] - - [6784, 448, 1, 256] - - [441, 6544.88] + - [619, 6544.88] - - [3584, 6784, 1, 128] - - [421, 4623.6] + - [599, 4623.6] - - [128, 4288, 1, 256] - - [435, 3606.6] + - [613, 3606.6] - - [704, 448, 1, 3328] - - [432, 4478.01] + - [610, 4478.01] - - [128, 128, 1, 3328] - - [417, 2177.65] + - [595, 2177.65] - - [5056, 1856, 1, 256] - - [451, 5608.72] + - [629, 5608.72] - - [4608, 5984, 1, 1536] - - [440, 7859.85] + - [618, 7859.85] - - [256, 128, 1, 256] - - [376, 998.744] + - [554, 998.744] - - [1760, 3200, 1, 1760] - - [433, 8179.64] + - [611, 8179.64] - - [1024, 1856, 1, 256] - - [443, 6143.27] + - [621, 6143.27] - - [4096, 1600, 1, 1024] - - [451, 5851.52] + - [629, 5851.52] - - [4288, 64, 1, 128] - - [356, 1372.26] + - [534, 1372.26] - - [256, 448, 1, 3328] - - [395, 4795.1] + - [573, 4795.1] - - [1408, 6784, 1, 1280] - - [437, 8426.5] + - [615, 8426.5] - - [3584, 3584, 1, 1280] - - [437, 7556.56] + - [615, 7556.56] - - [7680, 24000, 1, 2560] - - [430, 5019.19] + - [608, 5019.19] - - [64, 2368, 1, 1280] - - [381, 4061.8] + - [559, 4061.8] - - [448, 2368, 1, 1280] - - [432, 5928.77] + - [610, 5928.77] - - [4608, 48000, 1, 1536] - - [437, 6937.4] + - [615, 6937.4] - - [5888, 5888, 1, 128] - - [422, 3744.0] + - [600, 3744.0] - - [64, 6784, 1, 3328] - - [432, 5988.72] + - [610, 5988.72] - - [2944, 256, 1, 1280] - - [438, 6717.97] + - [616, 6717.97] - - [2048, 16, 1, 2048] - - [390, 1210.58] + - [568, 1210.58] - - [256, 2368, 1, 128] - - [425, 1936.07] + - [603, 1936.07] - - [5056, 2368, 1, 3328] - - [443, 8875.63] + - [621, 8875.63] - - [2944, 4288, 1, 256] - - [437, 8063.24] + - [615, 8063.24] - - [1408, 3584, 1, 1280] - - [433, 8197.07] + - [611, 8197.07] - - [2368, 64, 1, 256] - - [432, 2365.79] + - [610, 2365.79] - - [64, 448, 1, 3328] - - [418, 3027.4] + - [596, 3027.4] - - [704, 128, 1, 3328] - - [389, 4452.19] + - [567, 4452.19] - - [8192, 1600, 1, 2048] - - [437, 7229.93] + - [615, 7229.93] - - [1856, 704, 1, 256] - - [439, 5545.45] + - [617, 5545.45] - - [4, 4288, 1, 1280] - - [390, 523.825] + - [568, 523.825] - - [1408, 448, 1, 3328] - - [444, 4789.4] + - [622, 4789.4] - - [1024, 4, 1, 3328] - - [370, 504.223] + - [548, 504.223] - - [512, 24000, 1, 2560] - - [443, 8903.62] + - [621, 8903.62] - - [2368, 6784, 1, 3328] - - [443, 8311.14] + - [621, 8311.14] - - [1856, 1408, 1, 1280] - - [433, 8160.11] + - [611, 8160.11] - - [1856, 448, 1, 1280] - - [435, 6243.07] + - [613, 6243.07] - - [6784, 704, 1, 128] - - [421, 4069.05] + - [599, 4069.05] - - [4, 4, 1, 256] - - [390, 0.842029] + - [568, 0.842029] - - [128, 5888, 1, 128] - - [421, 2328.02] + - [599, 2328.02] - - [1408, 5888, 1, 256] - - [432, 6986.91] + - [610, 6986.91] - - [704, 2944, 1, 1280] - - [433, 7905.03] + - [611, 7905.03] - - [4288, 64, 1, 1280] - - [408, 3828.27] + - [586, 3828.27] - - [256, 64, 1, 256] - - [383, 655.46] + - [561, 655.46] - - [704, 1856, 1, 256] - - [441, 5444.37] + - [619, 5444.37] - - [704, 6784, 1, 128] - - [421, 4319.77] + - [599, 4319.77] - - [3584, 704, 1, 1280] - - [441, 7726.43] + - [619, 7726.43] - - [256, 128, 1, 1280] - - [376, 2184.63] + - [554, 2184.63] - - [5888, 2368, 1, 256] - - [443, 8192.69] + - [621, 8192.69] - - [256, 2368, 1, 1280] - - [438, 5675.54] + - [616, 5675.54] - - [2944, 6784, 1, 128] - - [426, 4248.35] + - [604, 4248.35] - - [3584, 448, 1, 3328] - - [437, 6560.77] + - [615, 6560.77] - - [1408, 4, 1, 256] - - [455, 176.79] + - [633, 176.79] - - [704, 2368, 1, 3328] - - [438, 7085.31] + - [616, 7085.31] - - [2944, 448, 1, 256] - - [434, 3412.0] + - [612, 3412.0] - - [1856, 448, 1, 128] - - [422, 2748.82] + - [600, 2748.82] - - [4288, 4, 1, 3328] - - [390, 553.648] + - [568, 553.648] - - [2368, 128, 1, 1280] - - [411, 4173.65] + - [589, 4173.65] - - [256, 5888, 1, 128] - - [426, 2860.98] + - [604, 2860.98] - - [64, 6784, 1, 256] - - [439, 3637.18] + - [617, 3637.18] - - [64, 5056, 1, 1280] - - [438, 4289.53] + - [616, 4289.53] - - [4, 6784, 1, 128] - - [452, 160.906] + - [630, 160.906] - - [2048, 3200, 1, 512] - - [439, 6927.09] + - [617, 6927.09] - - [2944, 2944, 1, 1280] - - [431, 6267.85] + - [609, 6267.85] - - [5056, 448, 1, 3328] - - [432, 7400.36] + - [610, 7400.36] - - [4, 3584, 1, 1280] - - [390, 499.83] + - [568, 499.83] - - [1408, 128, 1, 128] - - [367, 1037.36] + - [545, 1037.36] - - [6784, 704, 1, 3328] - - [438, 7633.95] + - [616, 7633.95] - - [128, 64, 1, 1280] - - [390, 1170.39] + - [568, 1170.39] - - [2368, 256, 1, 1280] - - [438, 5609.89] + - [616, 5609.89] - - [4, 448, 1, 3328] - - [458, 358.5] + - [636, 358.5] - - [5888, 4288, 1, 128] - - [426, 4521.74] + - [604, 4521.74] - - [4, 5888, 1, 256] - - [390, 353.933] + - [568, 353.933] - - [1408, 2944, 1, 3328] - - [431, 8951.41] + - [609, 8951.41] - - [3584, 704, 1, 128] - - [421, 3395.41] + - [599, 3395.41] - - [4608, 12000, 1, 1536] - - [430, 6609.99] + - [608, 6609.99] - - [64, 1024, 1, 256] - - [376, 1588.85] + - [554, 1588.85] - - [5056, 5056, 1, 128] - - [421, 4080.81] + - [599, 4080.81] - - [2368, 448, 1, 1280] - - [432, 5423.04] + - [610, 5423.04] - - [128, 3584, 1, 256] - - [438, 4705.25] + - [616, 4705.25] - - [704, 448, 1, 1280] - - [435, 3961.07] + - [613, 3961.07] - - [8192, 800, 1, 2048] - - [433, 6306.36] + - [611, 6306.36] - - [448, 5056, 1, 128] - - [425, 3709.56] + - [603, 3709.56] - - [256, 4, 1, 1280] - - [457, 163.94] + - [635, 163.94] - - [5056, 3584, 1, 256] - - [430, 7008.34] + - [608, 7008.34] - - [2368, 4, 1, 3328] - - [390, 496.366] + - [568, 496.366] - - [1408, 5056, 1, 128] - - [425, 4175.37] + - [603, 4175.37] - - [2944, 3584, 1, 128] - - [421, 4659.79] + - [599, 4659.79] - - [3584, 2368, 1, 256] - - [443, 5851.87] + - [621, 5851.87] - - [128, 3584, 1, 3328] - - [433, 6105.04] + - [611, 6105.04] - - [128, 1024, 1, 1280] - - [373, 3848.09] + - [551, 3848.09] - - [8448, 24000, 1, 2816] - - [443, 5128.64] + - [621, 5128.64] - - [64, 704, 1, 256] - - [376, 1253.83] + - [554, 1253.83] - - [4288, 256, 1, 1280] - - [432, 5625.86] + - [610, 5625.86] - - [3584, 3584, 1, 3328] - - [437, 8206.15] + - [615, 8206.15] - - [4, 704, 1, 128] - - [452, 29.5484] + - [630, 29.5484] - - [5888, 6784, 1, 256] - - [439, 8248.75] + - [617, 8248.75] - - [4288, 2944, 1, 3328] - - [437, 8657.12] + - [615, 8657.12] - - [2944, 64, 1, 128] - - [356, 1240.7] + - [534, 1240.7] - - [1024, 128, 1, 3328] - - [381, 4433.1] + - [559, 4433.1] - - [1024, 16, 1, 500000] - - [344, 2571.15] + - [522, 2571.15] - - [4288, 128, 1, 3328] - - [381, 5716.85] + - [559, 5716.85] - - [7680, 128, 1, 2560] - - [379, 5488.1] + - [557, 5488.1] - - [256, 5056, 1, 1280] - - [439, 6380.06] + - [617, 6380.06] - - [1408, 256, 1, 128] - - [425, 1633.83] + - [603, 1633.83] - - [2944, 5888, 1, 3328] - - [434, 7849.02] + - [612, 7849.02] - - [6784, 5888, 1, 1280] - - [443, 9047.72] + - [621, 9047.72] - - [2048, 800, 1, 512] - - [438, 4841.17] + - [616, 4841.17] - - [704, 128, 1, 256] - - [383, 1567.27] + - [561, 1567.27] - - [5888, 4288, 1, 1280] - - [437, 7982.93] + - [615, 7982.93] - - [1024, 24000, 1, 2048] - - [439, 5774.4] + - [617, 5774.4] - - [448, 256, 1, 1280] - - [373, 3707.19] + - [551, 3707.19] - - [5888, 3584, 1, 128] - - [426, 3804.5] + - [604, 3804.5] - - [1024, 2944, 1, 128] - - [421, 3308.36] + - [599, 3308.36] - - [5056, 4, 1, 1280] - - [454, 469.062] + - [632, 469.062] - - [256, 1408, 1, 1280] - - [432, 4899.99] + - [610, 4899.99] - - [3072, 16, 1, 1024] - - [390, 1233.72] + - [568, 1233.72] - - [704, 3584, 1, 128] - - [421, 3919.53] + - [599, 3919.53] - - [5888, 448, 1, 3328] - - [451, 6095.71] + - [629, 6095.71] - - [2368, 4288, 1, 1280] - - [433, 8338.4] + - [611, 8338.4] - - [4288, 2944, 1, 128] - - [425, 3946.6] + - [603, 3946.6] - - [1024, 6784, 1, 3328] - - [439, 7494.38] + - [617, 7494.38] - - [128, 2368, 1, 256] - - [438, 2895.42] + - [616, 2895.42] - - [6784, 64, 1, 3328] - - [432, 5964.99] + - [610, 5964.99] - - [5056, 2944, 1, 3328] - - [443, 6605.63] + - [621, 6605.63] - - [448, 128, 1, 256] - - [376, 1339.52] + - [554, 1339.52] - - [2944, 3584, 1, 256] - - [439, 7165.66] + - [617, 7165.66] - - [1408, 1408, 1, 3328] - - [443, 8332.96] + - [621, 8332.96] - - [1856, 128, 1, 1280] - - [438, 4498.43] + - [616, 4498.43] - - [3584, 3584, 1, 128] - - [422, 4000.11] + - [600, 4000.11] - - [64, 3584, 1, 256] - - [449, 2383.23] + - [627, 2383.23] - - [1408, 4, 1, 3328] - - [400, 423.008] + - [578, 423.008] - - [128, 2944, 1, 3328] - - [405, 5430.03] + - [583, 5430.03] - - [3584, 704, 1, 256] - - [438, 6154.09] + - [616, 6154.09] - - [2944, 448, 1, 3328] - - [438, 6507.82] + - [616, 6507.82] - - [3584, 1408, 1, 3328] - - [443, 8829.73] + - [621, 8829.73] - - [704, 3584, 1, 1280] - - [433, 7860.33] + - [611, 7860.33] - - [2944, 6784, 1, 1280] - - [443, 8894.6] + - [621, 8894.6] - - [1856, 6784, 1, 256] - - [443, 8115.19] + - [621, 8115.19] - - [4288, 448, 1, 3328] - - [435, 6397.35] + - [613, 6397.35] - - [6784, 4288, 1, 128] - - [421, 4109.54] + - [599, 4109.54] - - [6784, 704, 1, 1280] - - [431, 7999.14] + - [609, 7999.14] - - [256, 4288, 1, 256] - - [435, 4603.94] + - [613, 4603.94] - - [3584, 6784, 1, 256] - - [443, 7361.65] + - [621, 7361.65] - - [6144, 12000, 1, 2048] - - [442, 6311.76] + - [620, 6311.76] - - [6144, 16, 1, 2560] - - [391, 2240.65] + - [569, 2240.65] - - [3584, 64, 1, 128] - - [362, 1292.36] + - [540, 1292.36] - - [5888, 1024, 1, 3328] - - [430, 8394.59] + - [608, 8394.59] - - [448, 64, 1, 128] - - [353, 262.244] + - [531, 262.244] - - [704, 6784, 1, 1280] - - [437, 7740.66] + - [615, 7740.66] - - [4, 1024, 1, 1280] - - [390, 378.921] + - [568, 378.921] - - [5888, 128, 1, 256] - - [438, 5003.68] + - [616, 5003.68] - - [4096, 16, 1, 4096] - - [390, 1585.85] + - [568, 1585.85] - - [1856, 5056, 1, 3328] - - [431, 8522.92] + - [609, 8522.92] - - [4, 6784, 1, 256] - - [375, 387.757] + - [553, 387.757] - - [1024, 3584, 1, 128] - - [425, 3031.61] + - [603, 3031.61] - - [1024, 1408, 1, 128] - - [427, 2600.85] + - [605, 2600.85] - - [2368, 2944, 1, 128] - - [424, 4340.26] + - [602, 4340.26] - - [5056, 64, 1, 256] - - [438, 3109.62] + - [616, 3109.62] - - [4, 448, 1, 1280] - - [458, 253.835] + - [636, 253.835] - - [5056, 2944, 1, 128] - - [429, 3740.01] + - [607, 3740.01] - - [5888, 5056, 1, 3328] - - [443, 9016.48] + - [621, 9016.48] - - [1024, 704, 1, 128] - - [425, 2363.66] + - [603, 2363.66] - - [5888, 2368, 1, 128] - - [428, 3651.83] + - [606, 3651.83] - - [128, 5056, 1, 3328] - - [432, 6243.64] + - [610, 6243.64] - - [3584, 6784, 1, 1280] - - [430, 9080.67] + - [608, 9080.67] - - [448, 4, 1, 1280] - - [458, 243.083] + - [636, 243.083] - - [1856, 5888, 1, 256] - - [443, 8182.12] + - [621, 8182.12] - - [256, 256, 1, 256] - - [376, 1542.12] + - [554, 1542.12] - - [256, 64, 1, 128] - - [357, 135.226] + - [535, 135.226] - - [4288, 4288, 1, 3328] - - [443, 8674.64] + - [621, 8674.64] - - [4288, 1408, 1, 1280] - - [431, 7867.18] + - [609, 7867.18] - - [3584, 5056, 1, 128] - - [421, 4457.83] + - [599, 4457.83] - - [4, 1024, 1, 3328] - - [370, 440.394] + - [548, 440.394] - - [4288, 2368, 1, 256] - - [451, 5699.57] + - [629, 5699.57] - - [2944, 5056, 1, 1280] - - [443, 8236.56] + - [621, 8236.56] - - [448, 6784, 1, 256] - - [433, 6620.62] + - [611, 6620.62] - - [64, 128, 1, 128] - - [358, 67.6629] + - [536, 67.6629] - - [1856, 2368, 1, 128] - - [425, 4233.7] + - [603, 4233.7] - - [6784, 2368, 1, 3328] - - [443, 8269.9] + - [621, 8269.9] - - [256, 1024, 1, 1280] - - [432, 4882.88] + - [610, 4882.88] - - [704, 4, 1, 128] - - [452, 19.111] + - [630, 19.111] - - [256, 4, 1, 256] - - [390, 46.9114] + - [568, 46.9114] - - [4288, 128, 1, 256] - - [438, 4273.49] + - [616, 4273.49] - - [4288, 1856, 1, 3328] - - [433, 8195.81] + - [611, 8195.81] - - [3584, 448, 1, 128] - - [426, 2750.65] + - [604, 2750.65] - - [2048, 1600, 1, 2048] - - [449, 5753.59] + - [627, 5753.59] - - [256, 4, 1, 3328] - - [459, 297.978] + - [637, 297.978] - - [4, 1408, 1, 1280] - - [457, 402.386] + - [635, 402.386] - - [3584, 64, 1, 1280] - - [446, 4096.1] + - [624, 4096.1] - - [1408, 448, 1, 128] - - [421, 2498.25] + - [599, 2498.25] - - [3584, 1024, 1, 1280] - - [443, 7252.18] + - [621, 7252.18] - - [1856, 5056, 1, 256] - - [437, 7711.59] + - [615, 7711.59] - - [4, 3584, 1, 256] - - [454, 314.314] + - [632, 314.314] - - [4, 2944, 1, 1280] - - [390, 483.218] + - [568, 483.218] - - [1024, 4288, 1, 256] - - [442, 6544.52] + - [620, 6544.52] - - [5888, 3584, 1, 3328] - - [431, 8105.15] + - [609, 8105.15] - - [1856, 4, 1, 256] - - [390, 252.832] + - [568, 252.832] - - [4, 256, 1, 256] - - [375, 48.2882] + - [553, 48.2882] - - [5056, 3584, 1, 3328] - - [436, 7354.8] + - [614, 7354.8] - - [704, 448, 1, 128] - - [429, 1233.91] + - [607, 1233.91] - - [2368, 1408, 1, 1280] - - [437, 6654.24] + - [615, 6654.24] - - [5056, 2944, 1, 1280] - - [443, 8505.72] + - [621, 8505.72] - - [4, 4, 1, 128] - - [453, 0.1478505] + - [631, 0.1478505] - - [3584, 256, 1, 256] - - [435, 4616.47] + - [613, 4616.47] - - [1024, 6784, 1, 256] - - [437, 7944.98] + - [615, 7944.98] - - [4, 128, 1, 256] - - [390, 29.3571] + - [568, 29.3571] - - [64, 64, 1, 1280] - - [401, 642.61] + - [579, 642.61] - - [5124, 9124, 1, 2048] - - [443, 8019.4] + - [621, 8019.4] - - [6784, 4, 1, 128] - - [452, 193.067] + - [630, 193.067] - - [2944, 1408, 1, 128] - - [421, 3827.13] + - [599, 3827.13] - - [448, 128, 1, 3328] - - [394, 4064.0] + - [572, 4064.0] - - [3584, 1408, 1, 1280] - - [443, 7180.83] + - [621, 7180.83] - - [64, 4288, 1, 3328] - - [389, 4786.84] + - [567, 4786.84] - - [5056, 6784, 1, 3328] - - [430, 7889.83] + - [608, 7889.83] - - [128, 2944, 1, 256] - - [433, 3599.69] + - [611, 3599.69] - - [128, 6784, 1, 128] - - [351, 2606.79] + - [529, 2606.79] - - [3584, 4288, 1, 256] - - [437, 7299.81] + - [615, 7299.81] - - [448, 1856, 1, 256] - - [433, 5207.07] + - [611, 5207.07] - - [1856, 6784, 1, 3328] - - [435, 8386.36] + - [613, 8386.36] - - [3584, 128, 1, 3328] - - [379, 5590.04] + - [557, 5590.04] - - [64, 1856, 1, 256] - - [372, 1949.38] + - [550, 1949.38] - - [64, 448, 1, 256] - - [377, 955.833] + - [555, 955.833] - - [5888, 4288, 1, 256] - - [441, 7791.84] + - [619, 7791.84] - - [4, 448, 1, 128] - - [452, 8.84146] + - [630, 8.84146] - - [5056, 1408, 1, 256] - - [443, 5154.01] + - [621, 5154.01] - - [35, 8457, 1, 2048] - - [348, 3182.57] + - [526, 3182.57] - - [64, 256, 1, 1280] - - [397, 1713.46] + - [575, 1713.46] - - [3584, 1024, 1, 256] - - [433, 6528.18] + - [611, 6528.18] - - [256, 704, 1, 256] - - [432, 2720.46] + - [610, 2720.46] - - [5888, 5888, 1, 256] - - [441, 7992.26] + - [619, 7992.26] - - [4288, 1024, 1, 1280] - - [435, 7837.5] + - [613, 7837.5] - - [5888, 128, 1, 3328] - - [438, 7181.13] + - [616, 7181.13] - - [448, 6784, 1, 3328] - - [432, 7663.1] + - [610, 7663.1] - - [2944, 1408, 1, 1280] - - [441, 7903.14] + - [619, 7903.14] - - [64, 128, 1, 1280] - - [390, 1191.66] + - [568, 1191.66] - - [2944, 1856, 1, 3328] - - [431, 7844.41] + - [609, 7844.41] - - [2368, 64, 1, 128] - - [362, 997.973] + - [540, 997.973] - - [256, 1024, 1, 128] - - [421, 1215.84] + - [599, 1215.84] - - [3584, 5888, 1, 1280] - - [430, 8958.94] + - [608, 8958.94] - - [64, 4, 1, 128] - - [453, 1.21608] + - [631, 1.21608] - - [6784, 1856, 1, 1280] - - [430, 6728.8] + - [608, 6728.8] - - [2944, 5056, 1, 256] - - [443, 8275.21] + - [621, 8275.21] - - [4288, 4, 1, 128] - - [452, 147.644] + - [630, 147.644] - - [5888, 256, 1, 3328] - - [439, 7094.2] + - [617, 7094.2] - - [2944, 4288, 1, 128] - - [424, 4611.55] + - [602, 4611.55] - - [3584, 1408, 1, 256] - - [434, 6543.06] + - [612, 6543.06] - - [704, 3584, 1, 3328] - - [433, 8117.2] + - [611, 8117.2] - - [4096, 3200, 1, 1024] - - [448, 6656.13] + - [626, 6656.13] - - [5056, 448, 1, 1280] - - [446, 6096.2] + - [624, 6096.2] - - [3584, 1856, 1, 3328] - - [431, 8552.41] + - [609, 8552.41] - - [4288, 6784, 1, 1280] - - [437, 8212.46] + - [615, 8212.46] - - [2560, 7000, 1, 2560] - - [439, 7655.34] + - [617, 7655.34] - - [1408, 704, 1, 1280] - - [435, 5756.79] + - [613, 5756.79] - - [2944, 1024, 1, 256] - - [443, 6880.91] + - [621, 6880.91] - - [6784, 64, 1, 256] - - [438, 4438.96] + - [616, 4438.96] - - [2368, 4288, 1, 3328] - - [439, 8377.99] + - [617, 8377.99] - - [4, 1408, 1, 256] - - [456, 222.599] + - [634, 222.599] - - [1024, 1408, 1, 1280] - - [433, 6339.38] + - [611, 6339.38] - - [64, 64, 1, 256] - - [390, 187.346] + - [568, 187.346] - - [704, 256, 1, 3328] - - [432, 4046.14] + - [610, 4046.14] - - [6784, 5056, 1, 256] - - [443, 7972.17] + - [621, 7972.17] - - [1856, 1856, 1, 128] - - [427, 3716.61] + - [605, 3716.61] - - [3584, 5056, 1, 3328] - - [443, 8684.76] + - [621, 8684.76] - - [448, 6784, 1, 128] - - [425, 3829.05] + - [603, 3829.05] - - [4, 704, 1, 3328] - - [458, 393.206] + - [636, 393.206] - - [35, 8457, 1, 4096] - - [347, 3173.24] + - [525, 3173.24] - - [448, 2944, 1, 256] - - [441, 5553.41] + - [619, 5553.41] - - [4, 4288, 1, 3328] - - [400, 573.211] + - [578, 573.211] - - [2944, 6784, 1, 256] - - [437, 8566.06] + - [615, 8566.06] - - [2944, 2944, 1, 128] - - [421, 4540.83] + - [599, 4540.83] - - [4, 4, 1, 1280] - - [400, 3.14762] + - [578, 3.14762] - - [1856, 3584, 1, 1280] - - [437, 7306.36] + - [615, 7306.36] - - [64, 2944, 1, 256] - - [449, 2292.61] + - [627, 2292.61] - - [448, 256, 1, 128] - - [358, 797.93] + - [536, 797.93] - - [4288, 448, 1, 128] - - [424, 3430.5] + - [602, 3430.5] - - [4608, 24000, 1, 1536] - - [442, 6820.24] + - [620, 6820.24] - - [1856, 1408, 1, 3328] - - [445, 6600.24] + - [623, 6600.24] - - [128, 128, 1, 128] - - [350, 161.917] + - [528, 161.917] - - [1024, 4288, 1, 3328] - - [433, 7937.08] + - [611, 7937.08] - - [448, 2368, 1, 256] - - [441, 4526.45] + - [619, 4526.45] - - [1024, 4, 1, 128] - - [453, 16.9907] + - [631, 16.9907] - - [64, 1408, 1, 1280] - - [373, 3345.32] + - [551, 3345.32] - - [64, 6784, 1, 1280] - - [438, 5526.6] + - [616, 5526.6] - - [5056, 448, 1, 256] - - [432, 4216.65] + - [610, 4216.65] - - [2944, 2368, 1, 3328] - - [443, 7000.42] + - [621, 7000.42] - - [704, 4288, 1, 3328] - - [449, 6414.43] + - [627, 6414.43] - - [1408, 128, 1, 256] - - [432, 2720.46] + - [610, 2720.46] - - [1024, 1856, 1, 1280] - - [443, 7682.93] + - [621, 7682.93] - - [2048, 6400, 1, 2048] - - [439, 7418.22] + - [617, 7418.22] - - [512, 48000, 1, 2816] - - [443, 8884.77] + - [621, 8884.77] - - [5124, 9124, 1, 2560] - - [435, 6040.8] + - [613, 6040.8] - - [128, 2368, 1, 3328] - - [389, 5025.66] + - [567, 5025.66] - - [1024, 5888, 1, 256] - - [437, 7322.21] + - [615, 7322.21] - - [64, 2944, 1, 1280] - - [373, 4222.31] + - [551, 4222.31] - - [5056, 64, 1, 3328] - - [414, 4936.32] + - [592, 4936.32] - - [128, 704, 1, 128] - - [359, 683.414] + - [537, 683.414] - - [1408, 2368, 1, 256] - - [438, 6404.22] + - [616, 6404.22] - - [1408, 1408, 1, 256] - - [443, 4537.93] + - [621, 4537.93] - - [4, 64, 1, 128] - - [452, 2.56747] + - [630, 2.56747] - - [64, 1024, 1, 128] - - [351, 532.372] + - [529, 532.372] - - [1024, 8, 1, 500000] - - [341, 1685.08] + - [519, 1685.08] - - [2368, 2368, 1, 128] - - [422, 4334.33] + - [600, 4334.33] - - [64, 5888, 1, 128] - - [351, 2003.19] + - [529, 2003.19] - - [5888, 4, 1, 3328] - - [369, 339.118] + - [547, 339.118] - - [6784, 1408, 1, 128] - - [425, 4431.23] + - [603, 4431.23] - - [4288, 5888, 1, 256] - - [443, 7800.88] + - [621, 7800.88] - - [1408, 5056, 1, 256] - - [437, 8153.38] + - [615, 8153.38] - - [5056, 128, 1, 3328] - - [394, 5829.93] + - [572, 5829.93] - - [128, 128, 1, 1280] - - [397, 1691.35] + - [575, 1691.35] - - [448, 704, 1, 256] - - [438, 3364.28] + - [616, 3364.28] - - [4288, 3584, 1, 128] - - [422, 2952.68] + - [600, 2952.68] - - [2944, 128, 1, 3328] - - [394, 5620.82] + - [572, 5620.82] - - [64, 1408, 1, 3328] - - [395, 4169.91] + - [573, 4169.91] - - [3584, 5056, 1, 1280] - - [440, 7780.76] + - [618, 7780.76] - - [256, 448, 1, 1280] - - [373, 3929.45] + - [551, 3929.45] - - [704, 704, 1, 128] - - [421, 2346.17] + - [599, 2346.17] - - [5056, 4, 1, 128] - - [452, 144.557] + - [630, 144.557] - - [704, 256, 1, 1280] - - [441, 2283.22] + - [619, 2283.22] - - [64, 2368, 1, 3328] - - [373, 4921.69] + - [551, 4921.69] - - [1856, 1024, 1, 128] - - [422, 3459.57] + - [600, 3459.57] - - [1856, 64, 1, 128] - - [354, 918.237] + - [532, 918.237] - - [4096, 64, 1, 4096] - - [399, 4000.62] + - [577, 4000.62] - - [1024, 24000, 1, 1536] - - [435, 8502.36] + - [613, 8502.36] - - [704, 4288, 1, 256] - - [439, 6003.83] + - [617, 6003.83] - - [5888, 2368, 1, 1280] - - [430, 8801.3] + - [608, 8801.3] - - [128, 256, 1, 256] - - [384, 1070.08] + - [562, 1070.08] - - [64, 128, 1, 256] - - [390, 374.591] + - [568, 374.591] - - [2368, 5888, 1, 1280] - - [433, 8308.63] + - [611, 8308.63] - - [5888, 256, 1, 1280] - - [441, 7154.42] + - [619, 7154.42] - - [1760, 128, 1, 1760] - - [382, 5363.91] + - [560, 5363.91] - - [4, 5888, 1, 1280] - - [390, 542.304] + - [568, 542.304] - - [704, 128, 1, 128] - - [362, 779.447] + - [540, 779.447] - - [1024, 4, 1, 1280] - - [390, 392.531] + - [568, 392.531] - - [2368, 1856, 1, 3328] - - [433, 7975.32] + - [611, 7975.32] - - [2368, 128, 1, 128] - - [355, 1584.96] + - [533, 1584.96] - - [2944, 704, 1, 256] - - [441, 4039.21] + - [619, 4039.21] - - [5056, 128, 1, 128] - - [421, 2575.89] + - [599, 2575.89] - - [2368, 1024, 1, 3328] - - [449, 6165.54] + - [627, 6165.54] - - [256, 704, 1, 3328] - - [432, 4028.74] + - [610, 4028.74] - - [704, 3584, 1, 256] - - [443, 6102.92] + - [621, 6102.92] - - [704, 2944, 1, 3328] - - [433, 8202.84] + - [611, 8202.84] - - [6784, 1024, 1, 128] - - [425, 4386.4] + - [603, 4386.4] - - [256, 448, 1, 128] - - [362, 834.195] + - [540, 834.195] - - [448, 1024, 1, 3328] - - [450, 5412.48] + - [628, 5412.48] - - [2944, 1024, 1, 3328] - - [443, 6265.87] + - [621, 6265.87] - - [2944, 5056, 1, 128] - - [421, 4770.88] + - [599, 4770.88] - - [2368, 256, 1, 256] - - [438, 3975.23] + - [616, 3975.23] - - [1408, 6784, 1, 256] - - [437, 7987.02] + - [615, 7987.02] - - [6784, 1408, 1, 3328] - - [437, 8472.71] + - [615, 8472.71] - - [4288, 6784, 1, 128] - - [428, 3865.2] + - [606, 3865.2] - - [704, 64, 1, 256] - - [376, 1287.41] + - [554, 1287.41] - - [5888, 4, 1, 1280] - - [375, 510.022] + - [553, 510.022] - - [256, 2368, 1, 3328] - - [438, 5837.65] + - [616, 5837.65] - - [6784, 2944, 1, 1280] - - [443, 8560.54] + - [621, 8560.54] - - [4288, 1856, 1, 128] - - [421, 4617.07] + - [599, 4617.07] - - [1856, 2944, 1, 128] - - [421, 4287.73] + - [599, 4287.73] - - [6784, 448, 1, 128] - - [425, 3893.43] + - [603, 3893.43] - - [64, 3584, 1, 128] - - [351, 1609.76] + - [529, 1609.76] - - [448, 5056, 1, 1280] - - [441, 7124.41] + - [619, 7124.41] - - [2368, 1856, 1, 128] - - [424, 4004.65] + - [602, 4004.65] - - [64, 2944, 1, 3328] - - [374, 5086.48] + - [552, 5086.48] - - [4288, 704, 1, 256] - - [439, 6176.57] + - [617, 6176.57] - - [256, 3584, 1, 128] - - [422, 2553.15] + - [600, 2553.15] - - [5888, 704, 1, 256] - - [438, 6781.51] + - [616, 6781.51] - - [3584, 1024, 1, 128] - - [425, 3660.95] + - [603, 3660.95] - - [256, 5888, 1, 3328] - - [441, 7772.13] + - [619, 7772.13] - - [1408, 4288, 1, 3328] - - [437, 8832.86] + - [615, 8832.86] - - [6784, 4288, 1, 256] - - [443, 8566.14] + - [621, 8566.14] - - [4288, 256, 1, 128] - - [423, 1953.79] + - [601, 1953.79] - - [5888, 256, 1, 256] - - [441, 3730.53] + - [619, 3730.53] - - [6784, 1024, 1, 1280] - - [437, 8578.39] + - [615, 8578.39] - - [5888, 1024, 1, 128] - - [422, 4092.96] + - [600, 4092.96] - - [1024, 128, 1, 256] - - [372, 1897.98] + - [550, 1897.98] - - [512, 16, 1, 500000] - - [343, 2363.79] + - [521, 2363.79] - - [128, 64, 1, 3328] - - [400, 1592.56] + - [578, 1592.56] - - [448, 64, 1, 256] - - [390, 976.168] + - [568, 976.168] - - [2368, 256, 1, 128] - - [425, 2094.99] + - [603, 2094.99] - - [6784, 3584, 1, 1280] - - [437, 8570.16] + - [615, 8570.16] - - [1024, 6784, 1, 1280] - - [443, 8203.57] + - [621, 8203.57] - - [2944, 64, 1, 1280] - - [381, 4300.61] + - [559, 4300.61] - - [1408, 2944, 1, 1280] - - [433, 7349.64] + - [611, 7349.64] - - [256, 1856, 1, 256] - - [432, 4649.75] + - [610, 4649.75] - - [2048, 800, 1, 2048] - - [451, 4668.73] + - [629, 4668.73] - - [1408, 2368, 1, 3328] - - [441, 7537.74] + - [619, 7537.74] - - [2944, 4, 1, 3328] - - [390, 514.142] + - [568, 514.142] - - [128, 1408, 1, 3328] - - [382, 4991.64] + - [560, 4991.64] - - [2944, 1856, 1, 128] - - [421, 4317.39] + - [599, 4317.39] - - [256, 2944, 1, 128] - - [421, 2258.27] + - [599, 2258.27] - - [256, 6784, 1, 128] - - [421, 3147.02] + - [599, 3147.02] - - [2368, 4, 1, 128] - - [453, 33.9286] + - [631, 33.9286] - - [1408, 256, 1, 3328] - - [432, 5077.85] + - [610, 5077.85] - - [1856, 4, 1, 128] - - [453, 21.5025] + - [631, 21.5025] - - [5056, 6784, 1, 128] - - [421, 4945.11] + - [599, 4945.11] - - [4288, 5056, 1, 128] - - [424, 4729.87] + - [602, 4729.87] - - [1856, 5888, 1, 128] - - [421, 4707.96] + - [599, 4707.96] - - [2944, 5888, 1, 256] - - [435, 8014.78] + - [613, 8014.78] - - [3584, 1856, 1, 256] - - [437, 7567.13] + - [615, 7567.13] - - [4288, 3584, 1, 1280] - - [430, 8726.43] + - [608, 8726.43] - - [2368, 448, 1, 256] - - [438, 4227.7] + - [616, 4227.7] - - [4288, 256, 1, 3328] - - [439, 5487.41] + - [617, 5487.41] - - [1856, 704, 1, 128] - - [425, 3125.06] + - [603, 3125.06] - - [1408, 64, 1, 256] - - [385, 1620.09] + - [563, 1620.09] - - [64, 1856, 1, 128] - - [349, 955.147] + - [527, 955.147] - - [4, 256, 1, 128] - - [452, 10.8789] + - [630, 10.8789] - - [2560, 16, 1, 2560] - - [397, 2019.7] + - [575, 2019.7] - - [704, 5888, 1, 128] - - [426, 3976.26] + - [604, 3976.26] - - [6784, 3584, 1, 128] - - [425, 4018.91] + - [603, 4018.91] - - [1024, 64, 1, 256] - - [390, 1370.79] + - [568, 1370.79] - - [64, 2368, 1, 256] - - [432, 2255.76] + - [610, 2255.76] - - [4288, 5056, 1, 3328] - - [437, 8368.69] + - [615, 8368.69] - - [4, 1856, 1, 1280] - - [390, 392.126] + - [568, 392.126] - - [4288, 128, 1, 128] - - [355, 2287.03] + - [533, 2287.03] - - [1408, 1408, 1, 128] - - [425, 3233.48] + - [603, 3233.48] - - [7680, 16, 1, 2560] - - [393, 2257.37] + - [571, 2257.37] - - [1856, 128, 1, 128] - - [355, 1532.8] + - [533, 1532.8] - - [5056, 2368, 1, 256] - - [437, 8167.29] + - [615, 8167.29] - - [4288, 704, 1, 3328] - - [443, 6411.16] + - [621, 6411.16] - - [448, 3584, 1, 256] - - [443, 5477.74] + - [621, 5477.74] - - [2368, 64, 1, 1280] - - [373, 3936.52] + - [551, 3936.52] - - [2368, 1024, 1, 1280] - - [439, 7688.82] + - [617, 7688.82] - - [2944, 1408, 1, 3328] - - [430, 7668.78] + - [608, 7668.78] - - [1408, 448, 1, 256] - - [432, 4863.98] + - [610, 4863.98] - - [1024, 1408, 1, 3328] - - [441, 7448.99] + - [619, 7448.99] - - [2944, 5888, 1, 1280] - - [431, 8208.57] + - [609, 8208.57] - - [1408, 4, 1, 1280] - - [370, 479.419] + - [548, 479.419] - - [5888, 3584, 1, 256] - - [431, 8610.09] + - [609, 8610.09] - - [2368, 5056, 1, 128] - - [428, 3726.25] + - [606, 3726.25] - - [1408, 1856, 1, 3328] - - [432, 7829.48] + - [610, 7829.48] - - [4, 4, 1, 3328] - - [459, 4.39419] + - [637, 4.39419] - - [6784, 1408, 1, 1280] - - [432, 7690.8] + - [610, 7690.8] - - [4096, 7000, 1, 4096] - - [444, 6272.49] + - [622, 6272.49] - - [704, 2944, 1, 256] - - [433, 6095.91] + - [611, 6095.91] - - [4288, 64, 1, 256] - - [398, 2121.31] + - [576, 2121.31] - - [6784, 5888, 1, 3328] - - [437, 8955.6] + - [615, 8955.6] - - [2368, 4288, 1, 128] - - [421, 4699.65] + - [599, 4699.65] - - [64, 4288, 1, 1280] - - [411, 4013.73] + - [589, 4013.73] - - [6784, 64, 1, 1280] - - [432, 5418.83] + - [610, 5418.83] - - [3584, 128, 1, 128] - - [361, 2165.3] + - [539, 2165.3] - - [1024, 6784, 1, 128] - - [422, 3765.3] + - [600, 3765.3] - - [4, 1856, 1, 128] - - [453, 33.3728] + - [631, 33.3728] - - [1408, 64, 1, 3328] - - [394, 4489.51] + - [572, 4489.51] - - [6784, 4, 1, 256] - - [390, 400.262] + - [568, 400.262] - - [1408, 1408, 1, 1280] - - [437, 8139.53] + - [615, 8139.53] - - [16384, 400, 1, 4096] - - [441, 6087.28] + - [619, 6087.28] - - [256, 2368, 1, 256] - - [432, 4766.35] + - [610, 4766.35] - - [448, 4288, 1, 3328] - - [439, 7577.08] + - [617, 7577.08] - - [2368, 1408, 1, 256] - - [435, 5284.53] + - [613, 5284.53] - - [5888, 5056, 1, 128] - - [422, 3643.6] + - [600, 3643.6] - - [704, 2368, 1, 256] - - [437, 5334.73] + - [615, 5334.73] - - [1024, 24000, 1, 2560] - - [445, 7438.06] + - [623, 7438.06] - - [2944, 448, 1, 1280] - - [446, 4937.53] + - [624, 4937.53] - - [5888, 2368, 1, 3328] - - [431, 8201.84] + - [609, 8201.84] - - [5124, 9124, 1, 1760] - - [438, 6764.06] + - [616, 6764.06] - - [448, 1408, 1, 1280] - - [432, 5881.54] + - [610, 5881.54] - - [448, 1856, 1, 1280] - - [439, 6225.56] + - [617, 6225.56] - - [4288, 448, 1, 1280] - - [441, 5626.37] + - [619, 5626.37] - - [5888, 704, 1, 3328] - - [435, 7873.62] + - [613, 7873.62] - - [5056, 256, 1, 128] - - [426, 2921.03] + - [604, 2921.03] - - [1856, 256, 1, 128] - - [428, 1995.42] + - [606, 1995.42] - - [64, 1408, 1, 128] - - [349, 758.938] + - [527, 758.938] - - [704, 4, 1, 256] - - [390, 130.697] + - [568, 130.697] - - [1408, 5888, 1, 128] - - [421, 4574.05] + - [599, 4574.05] - - [7680, 12000, 1, 2560] - - [437, 8747.13] + - [615, 8747.13] - - [1408, 1024, 1, 256] - - [434, 4609.23] + - [612, 4609.23] - - [8192, 400, 1, 2048] - - [446, 5283.25] + - [624, 5283.25] - - [1024, 1856, 1, 128] - - [421, 2686.38] + - [599, 2686.38] - - [256, 704, 1, 128] - - [421, 1004.83] + - [599, 1004.83] - - [2560, 128, 1, 2560] - - [399, 4259.14] + - [577, 4259.14] - - [448, 1024, 1, 256] - - [432, 4813.24] + - [610, 4813.24] - - [128, 4, 1, 3328] - - [458, 128.408] + - [636, 128.408] - - [5056, 6784, 1, 1280] - - [440, 6579.85] + - [618, 6579.85] - - [1408, 64, 1, 128] - - [362, 819.3] + - [540, 819.3] - - [1024, 448, 1, 1280] - - [441, 5703.31] + - [619, 5703.31] - - [704, 5056, 1, 3328] - - [433, 7574.49] + - [611, 7574.49] - - [128, 5056, 1, 256] - - [432, 5113.53] + - [610, 5113.53] - - [64, 1024, 1, 3328] - - [417, 3980.1] + - [595, 3980.1] - - [1856, 4, 1, 3328] - - [371, 433.253] + - [549, 433.253] - - [4, 2944, 1, 128] - - [453, 46.6225] + - [631, 46.6225] - - [2368, 2944, 1, 3328] - - [431, 9002.13] + - [609, 9002.13] - - [448, 448, 1, 1280] - - [373, 3969.52] + - [551, 3969.52] - - [2368, 3584, 1, 256] - - [443, 7806.39] + - [621, 7806.39] - - [5056, 3584, 1, 1280] - - [430, 8971.56] + - [608, 8971.56] - - [5124, 9124, 1, 4096] - - [443, 7208.72] + - [621, 7208.72] - - [7680, 48000, 1, 2560] - - [437, 3835.91] + - [615, 3835.91] - - [448, 4, 1, 3328] - - [458, 409.7] + - [636, 409.7] - - [1856, 2944, 1, 1280] - - [430, 7173.71] + - [608, 7173.71] - - [1024, 48000, 1, 2816] - - [437, 8976.26] + - [615, 8976.26] - - [128, 1024, 1, 256] - - [376, 1969.26] + - [554, 1969.26] - - [2944, 1408, 1, 256] - - [439, 4585.12] + - [617, 4585.12] - - [4288, 1408, 1, 3328] - - [433, 8237.27] + - [611, 8237.27] - - [3584, 64, 1, 3328] - - [379, 5183.16] + - [557, 5183.16] - - [5888, 2944, 1, 128] - - [428, 3674.56] + - [606, 3674.56] - - [2944, 1024, 1, 128] - - [425, 3834.32] + - [603, 3834.32] - - [4288, 5056, 1, 1280] - - [437, 8086.1] + - [615, 8086.1] - - [5888, 6784, 1, 1280] - - [431, 6941.32] + - [609, 6941.32] - - [6784, 5056, 1, 128] - - [422, 4860.15] + - [600, 4860.15] - - [256, 1024, 1, 3328] - - [446, 5156.22] + - [624, 5156.22] - - [3584, 4, 1, 256] - - [390, 332.529] + - [568, 332.529] - - [1760, 1600, 1, 1760] - - [433, 6330.76] + - [611, 6330.76] - - [1856, 64, 1, 3328] - - [394, 4756.03] + - [572, 4756.03] - - [4, 128, 1, 3328] - - [458, 160.244] + - [636, 160.244] - - [5888, 1408, 1, 3328] - - [431, 8722.74] + - [609, 8722.74] - - [448, 2944, 1, 128] - - [424, 2997.63] + - [602, 2997.63] - - [2368, 1856, 1, 256] - - [432, 6662.34] + - [610, 6662.34] - - [256, 5056, 1, 256] - - [434, 5256.29] + - [612, 5256.29] - - [128, 3584, 1, 128] - - [353, 2073.56] + - [531, 2073.56] - - [448, 3584, 1, 3328] - - [430, 6833.96] + - [608, 6833.96] - - [4, 5056, 1, 3328] - - [400, 581.523] + - [578, 581.523] - - [704, 2368, 1, 128] - - [421, 3402.29] + - [599, 3402.29] - - [5888, 256, 1, 128] - - [426, 2977.54] + - [604, 2977.54] - - [4, 5056, 1, 128] - - [452, 65.2074] + - [630, 65.2074] - - [448, 256, 1, 256] - - [438, 1764.53] + - [616, 1764.53] - - [704, 4, 1, 3328] - - [390, 398.554] + - [568, 398.554] - - [1408, 256, 1, 256] - - [433, 3463.86] + - [611, 3463.86] - - [3584, 1856, 1, 128] - - [429, 3228.19] + - [607, 3228.19] - - [4288, 4288, 1, 128] - - [425, 4853.93] + - [603, 4853.93] - - [1856, 1024, 1, 3328] - - [449, 5994.68] + - [627, 5994.68] - - [128, 5888, 1, 3328] - - [403, 6512.85] + - [581, 6512.85] - - [1024, 5056, 1, 256] - - [443, 7859.42] + - [621, 7859.42] - - [5888, 5888, 1, 1280] - - [443, 8131.44] + - [621, 8131.44] - - [5056, 5888, 1, 128] - - [422, 4920.71] + - [600, 4920.71] - - [2368, 1408, 1, 3328] - - [441, 7110.74] + - [619, 7110.74] - - [1024, 48000, 1, 1536] - - [441, 8590.82] + - [619, 8590.82] - - [5888, 448, 1, 256] - - [442, 3567.74] + - [620, 3567.74] - - [2560, 3200, 1, 2560] - - [432, 7638.31] + - [610, 7638.31] - - [5888, 6784, 1, 128] - - [422, 3910.92] + - [600, 3910.92] - - [6144, 48000, 1, 2048] - - [443, 3412.95] + - [621, 3412.95] - - [6784, 5056, 1, 1280] - - [434, 7890.22] + - [612, 7890.22] - - [5056, 704, 1, 1280] - - [438, 7665.06] + - [616, 7665.06] - - [1024, 48000, 1, 2560] - - [443, 8188.5] + - [621, 8188.5] - - [4608, 32, 1, 1536] - - [411, 2856.97] + - [589, 2856.97] - - [1024, 2368, 1, 128] - - [421, 3019.35] + - [599, 3019.35] - - [128, 704, 1, 256] - - [372, 1696.33] + - [550, 1696.33] - - [2368, 448, 1, 3328] - - [438, 5799.29] + - [616, 5799.29] - - [128, 5888, 1, 1280] - - [432, 6680.75] + - [610, 6680.75] - - [16384, 800, 1, 4096] - - [437, 6322.22] + - [615, 6322.22] - - [448, 128, 1, 1280] - - [411, 2849.49] + - [589, 2849.49] - - [6784, 4, 1, 3328] - - [390, 563.12] + - [568, 563.12] - - [5888, 5056, 1, 1280] - - [437, 8631.33] + - [615, 8631.33] - - [1024, 64, 1, 3328] - - [412, 3481.96] + - [590, 3481.96] - - [3072, 48000, 1, 1024] - - [437, 9019.49] + - [615, 9019.49] - - [64, 3584, 1, 1280] - - [374, 4327.95] + - [552, 4327.95] - - [6784, 1408, 1, 256] - - [437, 6320.59] + - [615, 6320.59] - - [3584, 5888, 1, 128] - - [424, 4406.79] + - [602, 4406.79] - - [5056, 5888, 1, 256] - - [443, 8037.13] + - [621, 8037.13] - - [2368, 1024, 1, 256] - - [435, 4936.14] + - [613, 4936.14] - - [2944, 1856, 1, 256] - - [443, 7222.32] + - [621, 7222.32] - - [1856, 6784, 1, 1280] - - [433, 8251.81] + - [611, 8251.81] - - [64, 5056, 1, 128] - - [353, 1643.7] + - [531, 1643.7] - - [64, 6784, 1, 128] - - [351, 1929.77] + - [529, 1929.77] - - [448, 704, 1, 128] - - [423, 979.959] + - [601, 979.959] - - [4, 1024, 1, 128] - - [452, 20.1416] + - [630, 20.1416] - - [4288, 3584, 1, 256] - - [437, 8444.14] + - [615, 8444.14] - - [1408, 704, 1, 128] - - [421, 3021.0] + - [599, 3021.0] - - [64, 256, 1, 3328] - - [417, 2227.47] + - [595, 2227.47] - - [6784, 448, 1, 3328] - - [443, 6573.11] + - [621, 6573.11] - - [5056, 1856, 1, 1280] - - [435, 7976.23] + - [613, 7976.23] - - [1408, 1024, 1, 3328] - - [433, 7470.33] + - [611, 7470.33] - - [2368, 256, 1, 3328] - - [438, 5394.37] + - [616, 5394.37] - - [5888, 3584, 1, 1280] - - [430, 9031.55] + - [608, 9031.55] - - [1856, 3584, 1, 3328] - - [445, 7272.6] + - [623, 7272.6] - - [5888, 128, 1, 1280] - - [438, 6684.48] + - [616, 6684.48] - - [1024, 2944, 1, 256] - - [443, 7415.09] + - [621, 7415.09] - - [448, 6784, 1, 1280] - - [439, 7923.78] + - [617, 7923.78] - - [256, 3584, 1, 1280] - - [435, 6901.87] + - [613, 6901.87] - - [704, 5056, 1, 256] - - [440, 5004.55] + - [618, 5004.55] - - [3584, 1024, 1, 3328] - - [432, 7894.63] + - [610, 7894.63] - - [2944, 1856, 1, 1280] - - [437, 7903.27] + - [615, 7903.27] - - [128, 256, 1, 128] - - [350, 325.745] + - [528, 325.745] - - [5056, 256, 1, 256] - - [434, 3356.56] + - [612, 3356.56] - - [2944, 4288, 1, 3328] - - [443, 7813.93] + - [621, 7813.93] - - [2368, 3584, 1, 3328] - - [443, 8371.09] + - [621, 8371.09] - - [2944, 704, 1, 1280] - - [449, 5514.09] + - [627, 5514.09] - - [128, 4, 1, 256] - - [390, 25.3062] + - [568, 25.3062] - - [2944, 3584, 1, 1280] - - [437, 7738.83] + - [615, 7738.83] - - [1856, 5888, 1, 1280] - - [431, 8584.63] + - [609, 8584.63] - - [256, 256, 1, 1280] - - [411, 2962.18] + - [589, 2962.18] - - [2048, 3200, 1, 2048] - - [439, 6911.69] + - [617, 6911.69] - - [4288, 1408, 1, 256] - - [437, 7954.0] + - [615, 7954.0] - - [3584, 64, 1, 256] - - [438, 2780.42] + - [616, 2780.42] - - [64, 1856, 1, 3328] - - [373, 4912.04] + - [551, 4912.04] - - [256, 1408, 1, 128] - - [421, 1373.24] + - [599, 1373.24] - - [5888, 1408, 1, 128] - - [426, 4242.01] + - [604, 4242.01] - - [4288, 2368, 1, 1280] - - [435, 8012.7] + - [613, 8012.7] - - [4, 4288, 1, 256] - - [456, 301.674] + - [634, 301.674] - - [256, 4288, 1, 128] - - [421, 2706.36] + - [599, 2706.36] - - [2048, 128, 1, 2048] - - [416, 2885.26] + - [594, 2885.26] - - [256, 128, 1, 3328] - - [418, 3170.21] + - [596, 3170.21] - - [512, 8, 1, 500000] - - [342, 1915.12] + - [520, 1915.12] - - [6784, 2368, 1, 256] - - [437, 8323.66] + - [615, 8323.66] - - [5888, 128, 1, 128] - - [425, 2466.08] + - [603, 2466.08] - - [1024, 24000, 1, 2816] - - [435, 8131.64] + - [613, 8131.64] - - [7680, 5984, 1, 2560] - - [439, 6040.77] + - [617, 6040.77] - - [4288, 1856, 1, 256] - - [451, 5818.53] + - [629, 5818.53] - - [1856, 256, 1, 3328] - - [432, 6532.03] + - [610, 6532.03] - - [1856, 2944, 1, 256] - - [437, 7312.92] + - [615, 7312.92] - - [5056, 1024, 1, 128] - - [427, 4103.0] + - [605, 4103.0] - - [64, 5888, 1, 1280] - - [432, 5058.25] + - [610, 5058.25] - - [1760, 800, 1, 1760] - - [435, 7280.0] + - [613, 7280.0] - - [6784, 256, 1, 128] - - [425, 3257.69] + - [603, 3257.69] - - [5888, 704, 1, 128] - - [421, 3813.93] + - [599, 3813.93] - - [1408, 2368, 1, 128] - - [422, 3561.27] + - [600, 3561.27] - - [1024, 4288, 1, 1280] - - [441, 7752.74] + - [619, 7752.74] - - [2368, 5056, 1, 3328] - - [444, 7711.91] + - [622, 7711.91] - - [448, 4, 1, 128] - - [452, 18.4795] + - [630, 18.4795] - - [4, 256, 1, 3328] - - [459, 269.71] + - [637, 269.71] - - [4288, 1024, 1, 3328] - - [438, 7910.27] + - [616, 7910.27] - - [6144, 48000, 1, 2560] - - [437, 3541.09] + - [615, 3541.09] - - [1024, 5056, 1, 3328] - - [431, 8509.66] + - [609, 8509.66] - - [1024, 1856, 1, 3328] - - [437, 7907.93] + - [615, 7907.93] - - [704, 704, 1, 1280] - - [449, 5648.15] + - [627, 5648.15] - - [128, 2368, 1, 1280] - - [408, 4145.11] + - [586, 4145.11] - - [1408, 128, 1, 3328] - - [381, 4919.6] + - [559, 4919.6] - - [3584, 256, 1, 1280] - - [433, 5185.56] + - [611, 5185.56] - - [4, 128, 1, 128] - - [452, 3.07891] + - [630, 3.07891] - - [5888, 64, 1, 1280] - - [381, 4499.59] + - [559, 4499.59] - - [3584, 128, 1, 1280] - - [438, 5929.01] + - [616, 5929.01] - - [4, 256, 1, 1280] - - [457, 170.767] + - [635, 170.767] - - [128, 704, 1, 3328] - - [381, 4379.37] + - [559, 4379.37] - - [4288, 6784, 1, 256] - - [431, 7181.09] + - [609, 7181.09] - - [3584, 2944, 1, 3328] - - [437, 8553.3] + - [615, 8553.3] - - [128, 1856, 1, 256] - - [438, 3207.77] + - [616, 3207.77] - - [64, 4288, 1, 256] - - [432, 2907.99] + - [610, 2907.99] - - [4, 3584, 1, 3328] - - [390, 560.605] + - [568, 560.605] - - [64, 4, 1, 3328] - - [459, 67.5025] + - [637, 67.5025] - - [4, 64, 1, 3328] - - [459, 88.8467] + - [637, 88.8467] - - [5888, 2944, 1, 256] - - [437, 7255.77] + - [615, 7255.77] - - [1856, 64, 1, 256] - - [383, 1743.72] + - [561, 1743.72] - - [5056, 128, 1, 1280] - - [438, 6009.79] + - [616, 6009.79] - - [448, 4288, 1, 1280] - - [439, 6466.82] + - [617, 6466.82] - - [448, 1856, 1, 3328] - - [439, 6381.99] + - [617, 6381.99] - - [1024, 4288, 1, 128] - - [424, 3491.87] + - [602, 3491.87] - - [4, 1024, 1, 256] - - [457, 172.563] + - [635, 172.563] - - [5056, 4288, 1, 256] - - [437, 8241.52] + - [615, 8241.52] - - [1024, 448, 1, 256] - - [441, 4218.51] + - [619, 4218.51] - - [1024, 3584, 1, 256] - - [437, 6513.69] + - [615, 6513.69] - - [2944, 128, 1, 1280] - - [381, 4710.48] + - [559, 4710.48] - - [2048, 32, 1, 2048] - - [396, 1779.23] + - [574, 1779.23] - - [64, 256, 1, 256] - - [390, 655.46] + - [568, 655.46] - - [1408, 4, 1, 128] - - [453, 20.1249] + - [631, 20.1249] - - [128, 2368, 1, 128] - - [353, 1707.73] + - [531, 1707.73] - - [256, 704, 1, 1280] - - [432, 3735.31] + - [610, 3735.31] - - [64, 2368, 1, 128] - - [360, 1049.81] + - [538, 1049.81] - - [6784, 6784, 1, 3328] - - [437, 9277.94] + - [615, 9277.94] - - [448, 5888, 1, 1280] - - [443, 7319.75] + - [621, 7319.75] - - [5056, 448, 1, 128] - - [425, 3694.43] + - [603, 3694.43] - - [4288, 704, 1, 1280] - - [435, 7890.96] + - [613, 7890.96] - - [3584, 2944, 1, 128] - - [427, 4124.71] + - [605, 4124.71] - - [6784, 256, 1, 1280] - - [443, 7185.83] + - [621, 7185.83] - - [256, 2944, 1, 1280] - - [432, 6736.76] + - [610, 6736.76] - - [64, 4288, 1, 128] - - [351, 1614.41] + - [529, 1614.41] - - [2368, 5888, 1, 3328] - - [433, 8616.46] + - [611, 8616.46] - - [4, 64, 1, 256] - - [370, 11.4778] + - [548, 11.4778] - - [704, 1024, 1, 3328] - - [438, 6801.92] + - [616, 6801.92] - - [2368, 1856, 1, 1280] - - [435, 7853.57] + - [613, 7853.57] - - [448, 5056, 1, 3328] - - [438, 7453.04] + - [616, 7453.04] - - [128, 448, 1, 128] - - [353, 530.449] + - [531, 530.449] - - [128, 6784, 1, 256] - - [433, 5557.55] + - [611, 5557.55] - - [3584, 4288, 1, 128] - - [424, 4462.73] + - [602, 4462.73] - - [64, 448, 1, 128] - - [353, 278.132] + - [531, 278.132] - - [5888, 4288, 1, 3328] - - [430, 9153.55] + - [608, 9153.55] - - [2368, 704, 1, 256] - - [437, 5350.78] + - [615, 5350.78] - - [256, 1856, 1, 3328] - - [432, 6536.35] + - [610, 6536.35] - - [1856, 128, 1, 256] - - [446, 2847.36] + - [624, 2847.36] - - [6784, 128, 1, 128] - - [426, 2530.82] + - [604, 2530.82] - - [3584, 1408, 1, 128] - - [427, 3625.62] + - [605, 3625.62] - - [1856, 5056, 1, 1280] - - [433, 8123.39] + - [611, 8123.39] - - [2944, 1024, 1, 1280] - - [443, 8450.41] + - [621, 8450.41] - - [5056, 4, 1, 256] - - [457, 380.787] + - [635, 380.787] - - [3584, 5888, 1, 3328] - - [435, 8567.99] + - [613, 8567.99] - - [2368, 4288, 1, 256] - - [439, 7858.07] + - [617, 7858.07] - - [1024, 2368, 1, 3328] - - [433, 6776.45] + - [611, 6776.45] - - [64, 704, 1, 3328] - - [388, 3503.52] + - [566, 3503.52] - - [704, 1408, 1, 256] - - [433, 6099.99] + - [611, 6099.99] - - [4096, 128, 1, 4096] - - [413, 4116.57] + - [591, 4116.57] - - [1024, 3584, 1, 1280] - - [443, 7231.65] + - [621, 7231.65] - - [4288, 5888, 1, 3328] - - [437, 8762.42] + - [615, 8762.42] - - [4288, 4, 1, 1280] - - [390, 492.797] + - [568, 492.797] - - [4608, 16, 1, 1536] - - [391, 1892.58] + - [569, 1892.58] - - [5888, 64, 1, 128] - - [368, 1747.73] + - [546, 1747.73] - - [4, 5888, 1, 128] - - [453, 84.5915] + - [631, 84.5915] - - [1024, 2944, 1, 3328] - - [441, 6907.05] + - [619, 6907.05] - - [6784, 1856, 1, 256] - - [437, 6274.07] + - [615, 6274.07] - - [2048, 64, 1, 2048] - - [420, 2371.44] + - [598, 2371.44] - - [256, 6784, 1, 1280] - - [437, 7067.04] + - [615, 7067.04] - - [1856, 3584, 1, 256] - - [443, 7706.87] + - [621, 7706.87] - - [128, 448, 1, 3328] - - [388, 3995.93] + - [566, 3995.93] - - [6784, 1856, 1, 128] - - [425, 4459.09] + - [603, 4459.09] - - [4, 448, 1, 256] - - [390, 84.4294] + - [568, 84.4294] - - [5056, 128, 1, 256] - - [438, 4954.5] + - [616, 4954.5] - - [512, 24000, 1, 2816] - - [431, 8994.98] + - [609, 8994.98] - - [256, 5888, 1, 1280] - - [430, 6184.0] + - [608, 6184.0] - - [4, 128, 1, 1280] - - [458, 71.9597] + - [636, 71.9597] - - [16384, 1600, 1, 4096] - - [437, 6921.09] + - [615, 6921.09] - - [6784, 128, 1, 1280] - - [441, 6486.37] + - [619, 6486.37] - - [64, 1408, 1, 256] - - [378, 1647.86] + - [556, 1647.86] - - [2368, 1408, 1, 128] - - [425, 3937.1] + - [603, 3937.1] - - [1856, 448, 1, 256] - - [438, 4635.57] + - [616, 4635.57] - - [1408, 1024, 1, 128] - - [421, 3208.51] + - [599, 3208.51] - - [128, 64, 1, 128] - - [350, 70.192] + - [528, 70.192] - - [6784, 3584, 1, 3328] - - [443, 8466.28] + - [621, 8466.28] - - [1760, 7000, 1, 1760] - - [441, 8149.21] + - [619, 8149.21] - - [2944, 64, 1, 3328] - - [374, 5018.09] + - [552, 5018.09] - - [64, 64, 1, 128] - - [350, 35.5249] + - [528, 35.5249] - - [2368, 5056, 1, 1280] - - [437, 8764.0] + - [615, 8764.0] - - [64, 4, 1, 1280] - - [459, 43.6745] + - [637, 43.6745] - - [1408, 2368, 1, 1280] - - [438, 7660.38] + - [616, 7660.38] - - [128, 1408, 1, 1280] - - [373, 4185.27] + - [551, 4185.27] - - [256, 64, 1, 3328] - - [398, 2071.75] + - [576, 2071.75] - - [704, 4288, 1, 128] - - [421, 4069.18] + - [599, 4069.18] - - [128, 1856, 1, 3328] - - [404, 5776.15] + - [582, 5776.15] - - [2944, 2944, 1, 256] - - [443, 7949.31] + - [621, 7949.31] - - [2944, 4, 1, 1280] - - [390, 483.218] + - [568, 483.218] - - [5888, 4, 1, 256] - - [375, 396.765] + - [553, 396.765] - - [6784, 256, 1, 256] - - [449, 4044.83] + - [627, 4044.83] - - [256, 5056, 1, 3328] - - [432, 7607.37] + - [610, 7607.37] - - [128, 4288, 1, 1280] - - [373, 4958.78] + - [551, 4958.78] - - [5056, 1856, 1, 128] - - [425, 4560.94] + - [603, 4560.94] - - [5056, 1024, 1, 3328] - - [437, 8634.18] + - [615, 8634.18] - - [128, 128, 1, 256] - - [375, 699.151] + - [553, 699.151] - - [1760, 64, 1, 1760] - - [381, 4580.65] + - [559, 4580.65] - - [4288, 3584, 1, 3328] - - [443, 9143.76] + - [621, 9143.76] - - [448, 704, 1, 3328] - - [432, 4473.43] + - [610, 4473.43] - - [448, 448, 1, 128] - - [363, 1264.38] + - [541, 1264.38] - - [1024, 2368, 1, 1280] - - [441, 7452.51] + - [619, 7452.51] - - [1856, 704, 1, 3328] - - [432, 6103.34] + - [610, 6103.34] - - [4, 2368, 1, 128] - - [452, 96.019] + - [630, 96.019] - - [5888, 6784, 1, 3328] - - [437, 9131.74] + - [615, 9131.74] - - [704, 4288, 1, 1280] - - [439, 7906.46] + - [617, 7906.46] - - [704, 256, 1, 256] - - [432, 2772.78] + - [610, 2772.78] - - [1024, 48000, 1, 2048] - - [436, 6513.45] + - [614, 6513.45] - - [4288, 1024, 1, 128] - - [421, 4291.77] - - - [512, 2048, 1, 49] - - [467, 4555.08] - - - [512, 128, 1, 784] - - [460, 3195.39] - - - [2048, 512, 1, 49] - - [468, 4253.43] - - - [1024, 256, 1, 196] - - [464, 4039.43] + - [599, 4291.77] - - [256, 64, 1, 3136] - - [462, 3015.37] + - [640, 3015.37] - - [256, 1024, 1, 196] - - [466, 4225.45] - - - [64, 256, 1, 3136] - - [463, 3058.45] - - - [128, 512, 1, 784] - - [461, 3380.38] - - - [64, 64, 1, 3136] - - [465, 1372.44] + - [644, 4225.45] - - [1024, 1024, 1, 3328] - - [578, 8705.1] + - [756, 8705.1] - - [2048, 200, 1, 3200] - - [583, 6173.42] + - [761, 6173.42] - - [1024, 200, 1, 13312] - - [481, 5213.31] + - [659, 5213.31] - - [1024, 256, 1, 1536] - - [583, 5859.43] + - [761, 5859.43] - - [4096, 256, 1, 12288] - - [588, 8807.52] + - [766, 8807.52] - - [64, 200, 1, 1024] - - [555, 366.632] + - [733, 366.632] - - [32, 512, 1, 1024] - - [510, 453.049] + - [688, 453.049] - - [2048, 256, 1, 3328] - - [572, 7876.73] + - [750, 7876.73] - - [4096, 512, 1, 32] - - [576, 3975.74] + - [754, 3975.74] - - [2048, 256, 1, 13312] - - [553, 7837.81] + - [731, 7837.81] - - [4096, 200, 1, 11264] - - [588, 6902.76] + - [766, 6902.76] - - [2048, 512, 1, 1024] - - [582, 8100.14] + - [760, 8100.14] - - [2048, 1024, 1, 1664] - - [482, 9082.08] + - [660, 9082.08] - - [1024, 1024, 1, 64] - - [578, 4258.28] + - [756, 4258.28] - - [512, 1024, 1, 1536] - - [572, 7597.33] + - [750, 7597.33] - - [1024, 256, 1, 15360] - - [473, 6735.24] + - [651, 6735.24] - - [1, 512, 1, 1024] - - [523, 15.1657] + - [701, 15.1657] - - [4096, 512, 1, 1408] - - [485, 9024.52] + - [663, 9024.52] - - [1024, 200, 1, 1408] - - [583, 4461.09] + - [761, 4461.09] - - [1024, 512, 1, 512] - - [577, 6528.2] + - [755, 6528.2] - - [4096, 256, 1, 15360] - - [584, 8824.03] + - [762, 8824.03] - - [2048, 512, 1, 640] - - [574, 7989.25] + - [752, 7989.25] - - [4096, 1024, 1, 1280] - - [480, 9421.54] + - [658, 9421.54] - - [1024, 200, 1, 6144] - - [572, 4966.52] + - [750, 4966.52] - - [1024, 1024, 1, 512] - - [574, 7731.54] + - [752, 7731.54] - - [128, 512, 1, 2048] - - [490, 2190.34] + - [668, 2190.34] - - [2048, 1024, 1, 640] - - [480, 8581.8] + - [658, 8581.8] - - [1024, 256, 1, 3328] - - [572, 6192.71] + - [750, 6192.71] - - [4096, 1024, 1, 13312] - - [485, 9642.59] + - [663, 9642.59] - - [2048, 256, 1, 2048] - - [572, 7485.75] + - [750, 7485.75] - - [2048, 1024, 1, 13312] - - [485, 9352.26] + - [663, 9352.26] - - [2048, 512, 1, 16640] - - [573, 8839.17] + - [751, 8839.17] - - [1024, 512, 1, 128] - - [577, 4280.0] + - [755, 4280.0] - - [2048, 1024, 1, 3584] - - [480, 9264.72] + - [658, 9264.72] - - [2048, 512, 1, 256] - - [588, 6990.61] + - [766, 6990.61] - - [512, 256, 1, 3200] - - [535, 4154.52] + - [713, 4154.52] - - [4096, 1024, 1, 1920] - - [480, 9535.32] + - [658, 9535.32] - - [4096, 200, 1, 2560] - - [585, 6754.65] + - [763, 6754.65] - - [1024, 256, 1, 16384] - - [475, 6289.6] + - [653, 6289.6] - - [1024, 1024, 1, 1152] - - [578, 8407.39] + - [756, 8407.39] - - [2048, 200, 1, 32] - - [521, 1412.51] + - [699, 1412.51] - - [512, 1024, 1, 2816] - - [572, 7843.25] + - [750, 7843.25] - - [4096, 256, 1, 14336] - - [584, 8844.77] + - [762, 8844.77] - - [1024, 200, 1, 4608] - - [583, 4931.74] + - [761, 4931.74] - - [1024, 200, 1, 16384] - - [478, 5135.15] + - [656, 5135.15] - - [64, 256, 1, 1024] - - [556, 461.013] + - [734, 461.013] - - [1, 200, 1, 1024] - - [538, 7.49884] + - [716, 7.49884] - - [2048, 200, 1, 2080] - - [583, 6033.87] + - [761, 6033.87] - - [512, 256, 1, 1792] - - [493, 3153.71] + - [671, 3153.71] - - [2048, 200, 1, 1024] - - [583, 5711.3] + - [761, 5711.3] - - [4096, 1024, 1, 12288] - - [480, 9658.23] + - [658, 9658.23] - - [4096, 200, 1, 4096] - - [574, 6834.55] + - [752, 6834.55] - - [1024, 512, 1, 11264] - - [541, 7686.46] + - [719, 7686.46] - - [128, 512, 1, 1024] - - [511, 1458.99] + - [689, 1458.99] - - [32, 256, 1, 2048] - - [529, 384.899] + - [707, 384.899] - - [1024, 200, 1, 1792] - - [583, 4638.64] + - [761, 4638.64] - - [1024, 1024, 1, 1792] - - [578, 8550.56] + - [756, 8550.56] - - [32, 256, 1, 512] - - [562, 161.419] + - [740, 161.419] - - [512, 200, 1, 2816] - - [488, 3353.1] + - [666, 3353.1] - - [512, 200, 1, 3072] - - [473, 3298.89] + - [651, 3298.89] - - [1024, 1024, 1, 8192] - - [519, 8369.1] + - [697, 8369.1] - - [1024, 256, 1, 12288] - - [476, 6475.71] + - [654, 6475.71] - - [4096, 200, 1, 768] - - [578, 6367.97] + - [756, 6367.97] - - [1024, 512, 1, 16384] - - [594, 7367.12] + - [772, 7367.12] - - [4096, 256, 1, 1024] - - [574, 8214.16] + - [752, 8214.16] - - [1024, 512, 1, 256] - - [577, 5537.13] + - [755, 5537.13] - - [4096, 1024, 1, 8320] - - [480, 9674.26] + - [658, 9674.26] - - [4096, 256, 1, 9216] - - [582, 8791.02] + - [760, 8791.02] - - [1024, 512, 1, 1408] - - [572, 7459.65] + - [750, 7459.65] - - [1024, 512, 1, 5632] - - [583, 7997.91] + - [761, 7997.91] - - [4096, 200, 1, 256] - - [588, 5371.9] + - [766, 5371.9] - - [1024, 200, 1, 128] - - [566, 1998.15] + - [744, 1998.15] - - [256, 200, 1, 1024] - - [535, 1196.01] + - [713, 1196.01] - - [1024, 200, 1, 5120] - - [583, 4957.44] + - [761, 4957.44] - - [512, 1024, 1, 3072] - - [596, 7104.07] + - [774, 7104.07] - - [4096, 1024, 1, 15360] - - [480, 9669.04] + - [658, 9669.04] - - [1, 256, 1, 2048] - - [522, 13.9262] + - [700, 13.9262] - - [1024, 1024, 1, 4160] - - [574, 8759.3] + - [752, 8759.3] - - [1024, 256, 1, 256] - - [581, 3728.37] + - [759, 3728.37] - - [2048, 256, 1, 384] - - [583, 6123.17] + - [761, 6123.17] - - [512, 256, 1, 2560] - - [537, 3809.64] + - [715, 3809.64] - - [4096, 512, 1, 3072] - - [485, 9215.19] + - [663, 9215.19] - - [1024, 256, 1, 4160] - - [572, 6293.49] + - [750, 6293.49] - - [4096, 512, 1, 13312] - - [482, 9367.32] + - [660, 9367.32] - - [4096, 1024, 1, 3840] - - [480, 9631.57] + - [658, 9631.57] - - [4096, 200, 1, 640] - - [578, 6206.16] + - [756, 6206.16] - - [32, 200, 1, 2048] - - [516, 303.507] + - [694, 303.507] - - [1024, 200, 1, 512] - - [572, 3713.19] + - [750, 3713.19] - - [1024, 1024, 1, 7168] - - [575, 8475.74] + - [753, 8475.74] - - [2048, 1024, 1, 3200] - - [480, 9271.34] + - [658, 9271.34] - - [512, 512, 1, 1536] - - [583, 5832.27] + - [761, 5832.27] - - [4096, 256, 1, 768] - - [588, 8066.07] + - [766, 8066.07] - - [2048, 256, 1, 6656] - - [572, 8034.87] + - [750, 8034.87] - - [1024, 256, 1, 896] - - [572, 5467.54] + - [750, 5467.54] - - [2048, 256, 1, 512] - - [583, 6465.31] + - [761, 6465.31] - - [2048, 200, 1, 3072] - - [583, 6165.78] + - [761, 6165.78] - - [128, 200, 1, 1024] - - [540, 692.87] + - [718, 692.87] - - [4096, 512, 1, 3840] - - [485, 9272.7] + - [663, 9272.7] - - [1024, 200, 1, 3200] - - [583, 4838.85] + - [761, 4838.85] - - [4096, 512, 1, 5632] - - [480, 9335.52] + - [658, 9335.52] - - [4096, 512, 1, 64] - - [515, 5275.95] + - [693, 5275.95] - - [1024, 512, 1, 2816] - - [572, 7816.68] + - [750, 7816.68] - - [4096, 256, 1, 7680] - - [578, 8795.5] + - [756, 8795.5] - - [4096, 200, 1, 1024] - - [588, 6448.91] + - [766, 6448.91] - - [1024, 512, 1, 12288] - - [542, 7624.67] + - [720, 7624.67] - - [2048, 1024, 1, 512] - - [485, 8436.16] + - [663, 8436.16] - - [128, 256, 1, 2048] - - [559, 1342.28] + - [737, 1342.28] - - [2048, 200, 1, 1792] - - [583, 6020.47] + - [761, 6020.47] - - [1024, 1024, 1, 2816] - - [574, 8670.5] + - [752, 8670.5] - - [2048, 512, 1, 1536] - - [585, 8466.32] + - [763, 8466.32] - - [4096, 256, 1, 3072] - - [582, 8631.47] + - [760, 8631.47] - - [1024, 200, 1, 1536] - - [564, 4577.7] + - [742, 4577.7] - - [1024, 256, 1, 1024] - - [572, 5491.82] + - [750, 5491.82] - - [4096, 512, 1, 8192] - - [485, 9325.64] + - [663, 9325.64] - - [128, 1024, 1, 512] - - [583, 2534.42] + - [761, 2534.42] - - [4096, 512, 1, 2304] - - [480, 9193.09] + - [658, 9193.09] - - [2048, 256, 1, 5632] - - [583, 7999.64] + - [761, 7999.64] - - [1024, 256, 1, 5120] - - [583, 6307.32] + - [761, 6307.32] - - [1024, 512, 1, 6656] - - [583, 8028.95] + - [761, 8028.95] - - [4096, 512, 1, 2816] - - [480, 9234.5] + - [658, 9234.5] - - [4096, 200, 1, 2080] - - [567, 6697.96] + - [745, 6697.96] - - [1024, 200, 1, 2304] - - [583, 4752.91] + - [761, 4752.91] - - [2048, 200, 1, 13312] - - [572, 6346.23] + - [750, 6346.23] - - [64, 1024, 1, 1024] - - [556, 1359.68] + - [734, 1359.68] - - [4096, 256, 1, 3584] - - [578, 8668.9] + - [756, 8668.9] - - [2048, 1024, 1, 7680] - - [480, 9365.88] + - [658, 9365.88] - - [1024, 256, 1, 1664] - - [572, 5907.57] + - [750, 5907.57] - - [1, 512, 1, 2048] - - [499, 23.5057] + - [677, 23.5057] - - [512, 512, 1, 1024] - - [572, 5360.23] + - [750, 5360.23] - - [2048, 256, 1, 8192] - - [544, 7665.31] + - [722, 7665.31] - - [2048, 512, 1, 512] - - [574, 7767.33] + - [752, 7767.33] - - [4096, 512, 1, 1920] - - [480, 9133.04] + - [658, 9133.04] - - [4096, 200, 1, 12288] - - [588, 6910.75] + - [766, 6910.75] - - [1024, 512, 1, 3072] - - [518, 7310.43] + - [696, 7310.43] - - [2048, 512, 1, 1152] - - [578, 8342.36] + - [756, 8342.36] - - [1024, 256, 1, 2080] - - [572, 6010.46] + - [750, 6010.46] - - [4096, 1024, 1, 32] - - [568, 4793.59] + - [746, 4793.59] - - [4096, 512, 1, 16640] - - [480, 9365.41] + - [658, 9365.41] - - [2048, 200, 1, 9216] - - [572, 6315.98] + - [750, 6315.98] - - [2048, 200, 1, 2560] - - [572, 6119.24] + - [750, 6119.24] - - [2048, 1024, 1, 1024] - - [480, 8628.69] + - [658, 8628.69] - - [2048, 256, 1, 4608] - - [572, 7951.39] + - [750, 7951.39] - - [512, 200, 1, 768] - - [524, 2132.51] + - [702, 2132.51] - - [128, 256, 1, 512] - - [524, 670.117] + - [702, 670.117] - - [4096, 512, 1, 1792] - - [485, 9127.01] + - [663, 9127.01] - - [4096, 1024, 1, 8192] - - [480, 9591.37] + - [658, 9591.37] - - [1024, 256, 1, 2816] - - [583, 6119.11] + - [761, 6119.11] - - [1024, 1024, 1, 13312] - - [575, 8529.37] + - [753, 8529.37] - - [2048, 1024, 1, 4160] - - [480, 9305.67] + - [658, 9305.67] - - [2048, 256, 1, 3584] - - [572, 7903.23] + - [750, 7903.23] - - [128, 200, 1, 2048] - - [540, 1135.91] + - [718, 1135.91] - - [4096, 512, 1, 10240] - - [482, 9339.59] + - [660, 9339.59] - - [4096, 512, 1, 512] - - [480, 8446.78] + - [658, 8446.78] - - [2048, 1024, 1, 6656] - - [480, 9331.75] + - [658, 9331.75] - - [1024, 512, 1, 640] - - [572, 6776.04] + - [750, 6776.04] - - [2048, 512, 1, 768] - - [574, 8085.51] + - [752, 8085.51] - - [2048, 200, 1, 1408] - - [572, 5880.17] + - [750, 5880.17] - - [4096, 200, 1, 2048] - - [588, 6691.71] + - [766, 6691.71] - - [1024, 1024, 1, 5632] - - [574, 8749.63] + - [752, 8749.63] - - [2048, 512, 1, 3584] - - [578, 8704.23] + - [756, 8704.23] - - [64, 512, 1, 512] - - [514, 667.983] + - [692, 667.983] - - [64, 200, 1, 512] - - [524, 251.388] + - [702, 251.388] - - [1024, 200, 1, 64] - - [479, 1310.82] + - [657, 1310.82] - - [512, 512, 1, 2304] - - [572, 6078.8] + - [750, 6078.8] - - [2048, 1024, 1, 14336] - - [480, 9321.94] + - [658, 9321.94] - - [4096, 512, 1, 11264] - - [482, 9339.95] + - [660, 9339.95] - - [4096, 512, 1, 128] - - [567, 6566.53] + - [745, 6566.53] - - [1024, 512, 1, 64] - - [587, 2953.84] + - [765, 2953.84] - - [4096, 512, 1, 768] - - [480, 8738.23] + - [658, 8738.23] - - [4096, 1024, 1, 11264] - - [480, 9637.78] + - [658, 9637.78] - - [1, 256, 1, 1024] - - [570, 8.93234] + - [748, 8.93234] - - [4096, 200, 1, 7680] - - [567, 6889.57] + - [745, 6889.57] - - [1024, 200, 1, 12288] - - [539, 5237.74] + - [717, 5237.74] - - [1024, 1024, 1, 1280] - - [574, 8418.17] + - [752, 8418.17] - - [4096, 1024, 1, 16640] - - [480, 9675.01] + - [658, 9675.01] - - [2048, 1024, 1, 5632] - - [480, 9327.85] + - [658, 9327.85] - - [1024, 200, 1, 15360] - - [539, 5386.63] + - [717, 5386.63] - - [1, 1024, 1, 1024] - - [589, 27.3499] + - [767, 27.3499] - - [2048, 256, 1, 16384] - - [550, 7652.75] + - [728, 7652.75] - - [4096, 512, 1, 12288] - - [482, 9359.51] + - [660, 9359.51] - - [2048, 200, 1, 896] - - [583, 5628.96] + - [761, 5628.96] - - [4096, 1024, 1, 5632] - - [480, 9626.78] + - [658, 9626.78] - - [2048, 256, 1, 32] - - [576, 1889.43] + - [754, 1889.43] - - [2048, 256, 1, 1280] - - [572, 7390.94] + - [750, 7390.94] - - [4096, 256, 1, 4096] - - [574, 8694.37] + - [752, 8694.37] - - [2048, 256, 1, 11264] - - [572, 8113.95] + - [750, 8113.95] - - [4096, 200, 1, 9216] - - [574, 6891.08] + - [752, 6891.08] - - [1024, 512, 1, 4096] - - [520, 7348.46] + - [698, 7348.46] - - [2048, 1024, 1, 10240] - - [482, 9095.91] + - [660, 9095.91] - - [4096, 1024, 1, 640] - - [480, 9115.68] + - [658, 9115.68] - - [128, 1024, 1, 2048] - - [473, 3270.51] + - [651, 3270.51] - - [4096, 200, 1, 3840] - - [567, 6836.26] + - [745, 6836.26] - - [1024, 1024, 1, 1920] - - [578, 8562.82] + - [756, 8562.82] - - [2048, 200, 1, 7168] - - [583, 6296.23] + - [761, 6296.23] - - [2048, 512, 1, 16384] - - [474, 8632.51] + - [652, 8632.51] - - [2048, 1024, 1, 12288] - - [480, 9158.08] + - [658, 9158.08] - - [4096, 1024, 1, 10240] - - [480, 9658.84] + - [658, 9658.84] - - [1024, 1024, 1, 8320] - - [582, 8799.58] + - [760, 8799.58] - - [1024, 256, 1, 9216] - - [572, 6375.23] + - [750, 6375.23] - - [4096, 256, 1, 1152] - - [567, 8301.09] + - [745, 8301.09] - - [512, 200, 1, 2560] - - [533, 3088.51] + - [711, 3088.51] - - [2048, 256, 1, 1920] - - [572, 7714.94] + - [750, 7714.94] - - [2048, 1024, 1, 4608] - - [480, 9305.7] + - [658, 9305.7] - - [512, 256, 1, 1024] - - [580, 2887.74] + - [758, 2887.74] - - [1024, 256, 1, 1920] - - [564, 5913.12] + - [742, 5913.12] - - [4096, 512, 1, 3584] - - [480, 9275.69] + - [658, 9275.69] - - [2048, 512, 1, 4160] - - [585, 8734.03] + - [763, 8734.03] - - [2048, 512, 1, 5632] - - [588, 8758.98] + - [766, 8758.98] - - [4096, 1024, 1, 4608] - - [480, 9657.22] + - [658, 9657.22] - - [4096, 1024, 1, 3328] - - [480, 9621.45] + - [658, 9621.45] - - [4096, 256, 1, 7168] - - [574, 8770.05] + - [752, 8770.05] - - [4096, 200, 1, 128] - - [588, 4458.33] + - [766, 4458.33] - - [2048, 200, 1, 5120] - - [572, 6176.91] + - [750, 6176.91] - - [1024, 1024, 1, 6656] - - [574, 8780.45] + - [752, 8780.45] - - [512, 1024, 1, 3200] - - [583, 7887.09] + - [761, 7887.09] - - [512, 200, 1, 2304] - - [473, 2991.09] + - [651, 2991.09] - - [2048, 1024, 1, 9216] - - [485, 9325.46] + - [663, 9325.46] - - [2048, 256, 1, 1536] - - [583, 7551.73] + - [761, 7551.73] - - [4096, 256, 1, 256] - - [588, 6932.83] + - [766, 6932.83] - - [2048, 512, 1, 1408] - - [585, 8430.86] + - [763, 8430.86] - - [1024, 256, 1, 384] - - [577, 4462.13] + - [755, 4462.13] - - [2048, 1024, 1, 2304] - - [480, 9174.94] + - [658, 9174.94] - - [4096, 512, 1, 6144] - - [482, 9284.25] + - [660, 9284.25] - - [1024, 200, 1, 14336] - - [471, 5268.57] + - [649, 5268.57] - - [1024, 512, 1, 2080] - - [583, 7736.47] + - [761, 7736.47] - - [2048, 512, 1, 2304] - - [585, 8616.07] + - [763, 8616.07] - - [4096, 512, 1, 15360] - - [485, 9362.17] + - [663, 9362.17] - - [1024, 256, 1, 32] - - [505, 1028.12] + - [683, 1028.12] - - [1024, 200, 1, 2816] - - [583, 4780.58] + - [761, 4780.58] - - [4096, 200, 1, 512] - - [574, 6054.23] + - [752, 6054.23] - - [4096, 1024, 1, 7168] - - [485, 9468.49] + - [663, 9468.49] - - [2048, 256, 1, 14336] - - [546, 7865.52] + - [724, 7865.52] - - [1024, 200, 1, 3072] - - [583, 4804.2] + - [761, 4804.2] - - [2048, 200, 1, 1280] - - [583, 5846.31] + - [761, 5846.31] - - [1024, 1024, 1, 2304] - - [574, 8633.32] + - [752, 8633.32] - - [4096, 1024, 1, 9216] - - [480, 9641.03] + - [658, 9641.03] - - [2048, 512, 1, 4608] - - [585, 8743.3] + - [763, 8743.3] - - [4096, 1024, 1, 7680] - - [480, 9684.86] + - [658, 9684.86] - - [4096, 256, 1, 6144] - - [585, 8757.24] + - [763, 8757.24] - - [4096, 256, 1, 896] - - [578, 8258.93] + - [756, 8258.93] - - [512, 256, 1, 1536] - - [562, 3065.36] + - [740, 3065.36] - - [1024, 256, 1, 512] - - [572, 4752.85] + - [750, 4752.85] - - [2048, 256, 1, 640] - - [572, 6776.04] + - [750, 6776.04] - - [256, 256, 1, 2048] - - [509, 2249.06] + - [687, 2249.06] - - [2048, 1024, 1, 8192] - - [480, 9178.17] + - [658, 9178.17] - - [4096, 200, 1, 16640] - - [472, 7009.59] + - [650, 7009.59] - - [256, 512, 1, 512] - - [484, 2511.66] + - [662, 2511.66] - - [2048, 512, 1, 384] - - [585, 7467.7] + - [763, 7467.7] - - [2048, 200, 1, 16384] - - [553, 6327.31] + - [731, 6327.31] - - [4096, 200, 1, 10240] - - [578, 6892.74] + - [756, 6892.74] - - [1024, 512, 1, 9216] - - [527, 7530.09] + - [705, 7530.09] - - [4096, 1024, 1, 64] - - [502, 6260.26] + - [680, 6260.26] - - [4096, 200, 1, 1920] - - [588, 6710.27] + - [766, 6710.27] - - [2048, 1024, 1, 1280] - - [480, 8998.34] + - [658, 8998.34] - - [1024, 200, 1, 3840] - - [572, 4873.87] + - [750, 4873.87] - - [256, 1024, 1, 512] - - [583, 4766.35] + - [761, 4766.35] - - [2048, 1024, 1, 3328] - - [480, 9275.2] + - [658, 9275.2] - - [1024, 256, 1, 16640] - - [537, 6837.22] + - [715, 6837.22] - - [4096, 512, 1, 14336] - - [485, 9354.42] + - [663, 9354.42] - - [1024, 1024, 1, 16640] - - [582, 8832.37] + - [760, 8832.37] - - [1024, 256, 1, 1152] - - [583, 5642.66] + - [761, 5642.66] - - [512, 512, 1, 512] - - [572, 4779.93] + - [750, 4779.93] - - [4096, 512, 1, 8320] - - [485, 9327.96] + - [663, 9327.96] - - [2048, 512, 1, 7680] - - [588, 8793.96] + - [766, 8793.96] - - [4096, 1024, 1, 6656] - - [480, 9667.03] + - [658, 9667.03] - - [1024, 512, 1, 3584] - - [583, 7900.57] + - [761, 7900.57] - - [1024, 1024, 1, 32] - - [568, 2974.78] + - [746, 2974.78] - - [512, 512, 1, 2816] - - [564, 6155.85] + - [742, 6155.85] - - [2048, 512, 1, 1664] - - [588, 8496.55] + - [766, 8496.55] - - [1024, 1024, 1, 14336] - - [474, 8624.74] + - [652, 8624.74] - - [2048, 200, 1, 2048] - - [583, 6029.86] + - [761, 6029.86] - - [1024, 1024, 1, 3584] - - [574, 8702.62] + - [752, 8702.62] - - [512, 200, 1, 1280] - - [488, 2350.75] + - [666, 2350.75] - - [4096, 256, 1, 6656] - - [588, 8788.41] + - [766, 8788.41] - - [4096, 256, 1, 4160] - - [565, 8728.44] + - [743, 8728.44] - - [128, 256, 1, 1024] - - [547, 859.589] + - [725, 859.589] - - [512, 200, 1, 3200] - - [488, 3376.85] + - [666, 3376.85] - - [2048, 512, 1, 9216] - - [571, 8806.4] + - [749, 8806.4] - - [2048, 1024, 1, 256] - - [567, 7713.76] + - [745, 7713.76] - - [1024, 256, 1, 2304] - - [583, 6015.83] + - [761, 6015.83] - - [1024, 200, 1, 8192] - - [583, 5022.02] + - [761, 5022.02] - - [2048, 256, 1, 3072] - - [500, 7515.09] + - [678, 7515.09] - - [2048, 256, 1, 8320] - - [572, 8063.68] + - [750, 8063.68] - - [4096, 512, 1, 1024] - - [482, 8824.41] + - [660, 8824.41] - - [1024, 512, 1, 3200] - - [572, 7866.39] + - [750, 7866.39] - - [1024, 512, 1, 896] - - [564, 7161.11] + - [742, 7161.11] - - [2048, 512, 1, 1280] - - [578, 8384.52] + - [756, 8384.52] - - [4096, 200, 1, 64] - - [487, 3260.6] + - [665, 3260.6] - - [1024, 256, 1, 6144] - - [593, 6143.72] + - [771, 6143.72] - - [1024, 200, 1, 2560] - - [572, 4762.89] + - [750, 4762.89] - - [1024, 1024, 1, 5120] - - [501, 8454.23] + - [679, 8454.23] - - [2048, 512, 1, 6656] - - [578, 8799.05] + - [756, 8799.05] - - [4096, 1024, 1, 1536] - - [480, 9503.37] + - [658, 9503.37] - - [1024, 1024, 1, 128] - - [503, 5825.52] + - [681, 5825.52] - - [512, 1024, 1, 1792] - - [572, 7701.12] + - [750, 7701.12] - - [2048, 1024, 1, 32] - - [483, 3938.41] + - [661, 3938.41] - - [4096, 256, 1, 2816] - - [567, 8652.2] + - [745, 8652.2] - - [1024, 1024, 1, 15360] - - [474, 8719.7] + - [652, 8719.7] - - [1024, 256, 1, 5632] - - [572, 6344.18] + - [750, 6344.18] - - [1024, 1024, 1, 4096] - - [575, 8187.86] + - [753, 8187.86] - - [2048, 200, 1, 4160] - - [583, 6222.48] + - [761, 6222.48] - - [512, 256, 1, 768] - - [514, 2771.67] + - [692, 2771.67] - - [4096, 512, 1, 640] - - [485, 8590.58] + - [663, 8590.58] - - [2048, 512, 1, 8192] - - [527, 8494.9] + - [705, 8494.9] - - [1024, 512, 1, 768] - - [572, 7049.35] + - [750, 7049.35] - - [4096, 200, 1, 8320] - - [567, 6908.7] + - [745, 6908.7] - - [2048, 512, 1, 896] - - [574, 8224.23] + - [752, 8224.23] - - [4096, 200, 1, 7168] - - [585, 6878.59] + - [763, 6878.59] - - [2048, 512, 1, 13312] - - [573, 8803.04] + - [751, 8803.04] - - [64, 512, 1, 1024] - - [477, 844.024] + - [655, 844.024] - - [2048, 200, 1, 3840] - - [572, 6192.48] + - [750, 6192.48] - - [1024, 1024, 1, 768] - - [565, 8098.51] + - [743, 8098.51] - - [4096, 512, 1, 16384] - - [485, 9345.73] + - [663, 9345.73] - - [4096, 256, 1, 2304] - - [565, 8596.45] + - [743, 8596.45] - - [1, 256, 1, 4096] - - [570, 19.9293] + - [748, 19.9293] - - [1024, 1024, 1, 11264] - - [575, 8491.48] + - [753, 8491.48] - - [2048, 200, 1, 16640] - - [569, 6510.64] + - [747, 6510.64] - - [1024, 256, 1, 3072] - - [583, 6179.55] + - [761, 6179.55] - - [4096, 1024, 1, 512] - - [480, 9032.25] + - [658, 9032.25] - - [2048, 256, 1, 2816] - - [572, 7793.57] + - [750, 7793.57] - - [32, 512, 1, 512] - - [484, 318.816] + - [662, 318.816] - - [256, 512, 1, 2048] - - [535, 3369.02] + - [713, 3369.02] - - [1024, 512, 1, 384] - - [583, 6198.58] + - [761, 6198.58] - - [2048, 200, 1, 7680] - - [572, 6307.7] + - [750, 6307.7] - - [1024, 512, 1, 4608] - - [583, 7953.48] - - - [2048, 256, 1, 768] - - [583, 7059.24] + - [761, 7953.48] - - [4096, 200, 1, 32] - - [532, 2199.29] + - [710, 2199.29] - - [4096, 200, 1, 3328] - - [567, 6813.12] + - [745, 6813.12] - - [1024, 200, 1, 1152] - - [572, 4375.65] + - [750, 4375.65] - - [1024, 1024, 1, 1408] - - [574, 8457.91] + - [752, 8457.91] - - [2048, 200, 1, 15360] - - [548, 6333.1] + - [726, 6333.1] - - [512, 1024, 1, 2048] - - [558, 6280.76] + - [736, 6280.76] - - [1024, 512, 1, 1024] - - [583, 7064.19] + - [761, 7064.19] - - [1024, 200, 1, 10240] - - [572, 5030.69] + - [750, 5030.69] - - [4096, 256, 1, 5632] - - [585, 8765.22] + - [763, 8765.22] - - [512, 512, 1, 3072] - - [595, 5942.44] + - [773, 5942.44] - - [2048, 256, 1, 1408] - - [572, 7545.05] + - [750, 7545.05] - - [2048, 256, 1, 6144] - - [583, 7963.97] + - [761, 7963.97] - - [4096, 256, 1, 3328] - - [578, 8682.58] + - [756, 8682.58] - - [1024, 200, 1, 1664] - - [572, 4595.4] + - [750, 4595.4] - - [2048, 1024, 1, 1152] - - [480, 8942.65] + - [658, 8942.65] - - [2048, 512, 1, 6144] - - [573, 8729.71] + - [751, 8729.71] - - [2048, 512, 1, 3200] - - [574, 8696.56] + - [752, 8696.56] - - [4096, 1024, 1, 2080] - - [513, 9538.45] + - [691, 9538.45] - - [4096, 1024, 1, 768] - - [480, 9260.75] + - [658, 9260.75] - - [4096, 1024, 1, 2560] - - [480, 9567.27] + - [658, 9567.27] - - [64, 200, 1, 2048] - - [512, 583.161] + - [690, 583.161] - - [2048, 200, 1, 4608] - - [583, 6243.28] + - [761, 6243.28] - - [1024, 1024, 1, 6144] - - [575, 8320.25] + - [753, 8320.25] - - [4096, 256, 1, 1664] - - [578, 8503.17] + - [756, 8503.17] - - [2048, 200, 1, 384] - - [583, 4940.0] + - [761, 4940.0] - - [1, 200, 1, 2048] - - [529, 11.3281] + - [707, 11.3281] - - [4096, 256, 1, 1792] - - [588, 8504.12] + - [766, 8504.12] - - [2048, 1024, 1, 64] - - [502, 5309.35] + - [680, 5309.35] - - [4096, 1024, 1, 16384] - - [469, 9428.61] + - [647, 9428.61] - - [1024, 512, 1, 16640] - - [583, 8122.55] + - [761, 8122.55] - - [2048, 512, 1, 10240] - - [573, 8766.21] + - [751, 8766.21] - - [4096, 512, 1, 6656] - - [480, 9351.75] + - [658, 9351.75] - - [2048, 256, 1, 16640] - - [572, 8135.27] + - [750, 8135.27] - - [2048, 512, 1, 2816] - - [574, 8660.32] + - [752, 8660.32] - - [1024, 200, 1, 32] - - [492, 780.291] + - [670, 780.291] - - [1, 512, 1, 4096] - - [517, 34.8671] + - [695, 34.8671] - - [256, 256, 1, 1024] - - [524, 1490.08] + - [702, 1490.08] - - [2048, 1024, 1, 128] - - [497, 6605.3] + - [675, 6605.3] - - [2048, 1024, 1, 2080] - - [480, 9159.51] + - [658, 9159.51] - - [2048, 1024, 1, 16640] - - [480, 9371.65] + - [658, 9371.65] - - [1024, 200, 1, 384] - - [583, 3378.24] + - [761, 3378.24] - - [4096, 256, 1, 384] - - [528, 7369.3] + - [706, 7369.3] - - [4096, 256, 1, 13312] - - [582, 8776.48] + - [760, 8776.48] - - [2048, 256, 1, 128] - - [577, 4280.0] + - [755, 4280.0] - - [512, 256, 1, 2304] - - [489, 3584.98] + - [667, 3584.98] - - [2048, 1024, 1, 3072] - - [482, 9156.52] + - [660, 9156.52] - - [1024, 1024, 1, 640] - - [578, 7928.84] + - [756, 7928.84] - - [256, 512, 1, 1024] - - [583, 2843.7] + - [761, 2843.7] - - [4096, 1024, 1, 1408] - - [480, 9437.56] + - [658, 9437.56] - - [4096, 200, 1, 5632] - - [585, 6873.96] + - [763, 6873.96] - - [4096, 1024, 1, 2048] - - [480, 9437.1] + - [658, 9437.1] - - [2048, 1024, 1, 2560] - - [485, 9195.62] + - [663, 9195.62] - - [4096, 1024, 1, 128] - - [567, 7407.26] + - [745, 7407.26] - - [1024, 200, 1, 3328] - - [583, 4857.39] + - [761, 4857.39] - - [2048, 200, 1, 1152] - - [572, 5760.1] + - [750, 5760.1] - - [1024, 200, 1, 9216] - - [471, 5053.21] + - [649, 5053.21] - - [4096, 256, 1, 512] - - [565, 7617.45] + - [743, 7617.45] - - [4096, 1024, 1, 14336] - - [480, 9665.12] + - [658, 9665.12] - - [1024, 1024, 1, 384] - - [503, 7478.8] + - [681, 7478.8] - - [2048, 200, 1, 512] - - [572, 5150.28] + - [750, 5150.28] - - [2048, 256, 1, 9216] - - [551, 7717.71] + - [729, 7717.71] - - [2048, 256, 1, 1792] - - [572, 7655.94] + - [750, 7655.94] - - [4096, 512, 1, 9216] - - [482, 9331.22] + - [660, 9331.22] - - [4096, 200, 1, 15360] - - [472, 6958.14] + - [650, 6958.14] - - [1024, 512, 1, 2048] - - [571, 7067.91] + - [749, 7067.91] - - [64, 256, 1, 2048] - - [496, 723.256] + - [674, 723.256] - - [4096, 200, 1, 1792] - - [574, 6699.65] + - [752, 6699.65] - - [1, 200, 1, 4096] - - [506, 15.6387] + - [684, 15.6387] - - [2048, 1024, 1, 2048] - - [485, 9071.93] + - [663, 9071.93] - - [1024, 200, 1, 2080] - - [564, 4679.19] + - [742, 4679.19] - - [2048, 200, 1, 1536] - - [583, 5939.92] + - [761, 5939.92] - - [1024, 1024, 1, 3072] - - [545, 8333.15] + - [723, 8333.15] - - [512, 200, 1, 1792] - - [470, 2679.73] + - [648, 2679.73] - - [1024, 256, 1, 11264] - - [473, 6470.98] + - [651, 6470.98] - - [2048, 512, 1, 12288] - - [520, 8729.24] + - [698, 8729.24] - - [1024, 256, 1, 1792] - - [583, 5931.44] + - [761, 5931.44] - - [1024, 200, 1, 7168] - - [583, 4970.33] + - [761, 4970.33] - - [32, 256, 1, 1024] - - [494, 237.334] + - [672, 237.334] - - [512, 256, 1, 3072] - - [537, 3813.1] + - [715, 3813.1] - - [1024, 1024, 1, 2080] - - [574, 8600.41] + - [752, 8600.41] - - [2048, 200, 1, 2304] - - [583, 6093.32] + - [761, 6093.32] - - [4096, 512, 1, 1536] - - [480, 9075.0] + - [658, 9075.0] - - [2048, 256, 1, 7168] - - [583, 7895.26] + - [761, 7895.26] - - [2048, 512, 1, 1792] - - [585, 8531.92] + - [763, 8531.92] - - [1024, 200, 1, 2048] - - [572, 4685.43] + - [750, 4685.43] - - [1024, 1024, 1, 4608] - - [578, 8735.71] + - [756, 8735.71] - - [4096, 256, 1, 8192] - - [574, 8782.55] + - [752, 8782.55] - - [512, 1024, 1, 1280] - - [564, 7483.25] + - [742, 7483.25] - - [2048, 1024, 1, 16384] - - [474, 8878.96] + - [652, 8878.96] - - [512, 512, 1, 1280] - - [572, 5745.72] + - [750, 5745.72] - - [1024, 200, 1, 1280] - - [564, 4446.23] - - - [4096, 512, 1, 4096] - - [482, 9264.49] + - [742, 4446.23] - - [2048, 256, 1, 3200] - - [572, 7842.85] + - [750, 7842.85] - - [2048, 512, 1, 15360] - - [520, 8757.24] + - [698, 8757.24] - - [1024, 512, 1, 3328] - - [572, 7854.04] + - [750, 7854.04] - - [1024, 512, 1, 4160] - - [572, 7934.61] + - [750, 7934.61] - - [4096, 200, 1, 6656] - - [574, 6883.3] + - [752, 6883.3] - - [4096, 1024, 1, 1024] - - [480, 9229.44] + - [658, 9229.44] - - [2048, 200, 1, 3328] - - [583, 6182.74] + - [761, 6182.74] - - [1024, 1024, 1, 256] - - [503, 6932.83] + - [681, 6932.83] - - [512, 200, 1, 512] - - [524, 1910.77] + - [702, 1910.77] - - [2048, 256, 1, 64] - - [495, 2912.81] + - [673, 2912.81] - - [1024, 256, 1, 2560] - - [572, 6123.17] + - [750, 6123.17] - - [2048, 512, 1, 11264] - - [584, 8728.94] + - [762, 8728.94] - - [32, 200, 1, 1024] - - [579, 187.56] + - [757, 187.56] - - [32, 512, 1, 2048] - - [523, 694.521] + - [701, 694.521] - - [2048, 256, 1, 2304] - - [572, 7759.35] + - [750, 7759.35] - - [2048, 256, 1, 12288] - - [551, 7726.35] + - [729, 7726.35] - - [4096, 200, 1, 8192] - - [574, 6870.94] + - [752, 6870.94] - - [1024, 512, 1, 7168] - - [520, 7479.2] + - [698, 7479.2] - - [1024, 512, 1, 1792] - - [572, 7626.11] + - [750, 7626.11] - - [4096, 1024, 1, 1664] - - [480, 9503.54] + - [658, 9503.54] - - [4096, 200, 1, 2816] - - [567, 6775.44] + - [745, 6775.44] - - [1024, 1024, 1, 896] - - [574, 8229.99] + - [752, 8229.99] - - [1024, 200, 1, 8320] - - [535, 5173.58] + - [713, 5173.58] - - [1024, 1024, 1, 12288] - - [575, 8463.21] + - [753, 8463.21] - - [1024, 256, 1, 8320] - - [564, 6404.37] + - [742, 6404.37] - - [1024, 200, 1, 1024] - - [572, 4297.54] + - [750, 4297.54] - - [1024, 200, 1, 16640] - - [534, 5499.51] + - [712, 5499.51] - - [4096, 256, 1, 5120] - - [588, 8729.15] + - [766, 8729.15] - - [1024, 256, 1, 3200] - - [583, 6124.96] + - [761, 6124.96] - - [512, 512, 1, 2560] - - [583, 6109.79] + - [761, 6109.79] - - [4096, 256, 1, 2048] - - [588, 8511.05] + - [766, 8511.05] - - [1024, 256, 1, 640] - - [572, 5102.66] + - [750, 5102.66] - - [2048, 256, 1, 5120] - - [500, 7667.93] + - [678, 7667.93] - - [2048, 256, 1, 7680] - - [583, 8054.45] + - [761, 8054.45] - - [4096, 512, 1, 384] - - [578, 8190.77] + - [756, 8190.77] - - [2048, 200, 1, 3584] - - [572, 6166.12] + - [750, 6166.12] - - [1024, 512, 1, 1536] - - [572, 7517.9] + - [750, 7517.9] - - [4096, 512, 1, 3328] - - [480, 9259.45] + - [658, 9259.45] - - [4096, 1024, 1, 256] - - [480, 8341.79] + - [658, 8341.79] - - [2048, 200, 1, 64] - - [543, 2307.71] + - [721, 2307.71] - - [2048, 200, 1, 4096] - - [583, 6212.04] + - [761, 6212.04] - - [1024, 1024, 1, 1536] - - [574, 8484.15] + - [752, 8484.15] - - [2048, 1024, 1, 7168] - - [482, 9315.24] + - [660, 9315.24] - - [1024, 256, 1, 3584] - - [572, 6207.32] + - [750, 6207.32] - - [4096, 256, 1, 32] - - [576, 2892.72] + - [754, 2892.72] - - [4096, 256, 1, 1280] - - [585, 8392.9] + - [763, 8392.9] - - [512, 512, 1, 3200] - - [583, 6219.41] + - [761, 6219.41] - - [2048, 1024, 1, 1536] - - [482, 9052.55] + - [660, 9052.55] - - [2048, 256, 1, 1024] - - [572, 7192.9] + - [750, 7192.9] - - [128, 200, 1, 512] - - [562, 502.677] + - [740, 502.677] - - [4096, 512, 1, 7168] - - [485, 9329.11] + - [663, 9329.11] - - [1024, 512, 1, 1152] - - [572, 7358.53] + - [750, 7358.53] - - [64, 1024, 1, 2048] - - [490, 2102.51] + - [668, 2102.51] - - [2048, 512, 1, 3328] - - [574, 8694.69] + - [752, 8694.69] - - [4096, 1024, 1, 896] - - [480, 9343.02] + - [658, 9343.02] - - [1, 1024, 1, 2048] - - [530, 40.9324] + - [708, 40.9324] - - [4096, 200, 1, 3584] - - [578, 6810.3] + - [756, 6810.3] - - [4096, 1024, 1, 4096] - - [480, 9347.56] + - [658, 9347.56] - - [1024, 256, 1, 14336] - - [473, 6625.8] + - [651, 6625.8] - - [2048, 200, 1, 256] - - [572, 4413.3] + - [750, 4413.3] - - [4096, 256, 1, 16384] - - [474, 8752.13] + - [652, 8752.13] - - [4096, 256, 1, 1920] - - [565, 8533.78] + - [743, 8533.78] - - [32, 1024, 1, 512] - - [563, 647.369] + - [741, 647.369] - - [1024, 256, 1, 7680] - - [583, 6387.36] + - [761, 6387.36] - - [2048, 256, 1, 1664] - - [583, 7631.44] + - [761, 7631.44] - - [512, 200, 1, 1536] - - [488, 2576.88] + - [666, 2576.88] - - [2048, 1024, 1, 6144] - - [469, 9033.77] + - [647, 9033.77] - - [512, 256, 1, 2816] - - [535, 3977.46] + - [713, 3977.46] - - [4096, 512, 1, 4160] - - [482, 9289.02] + - [660, 9289.02] - - [4096, 512, 1, 2080] - - [561, 9150.28] + - [739, 9150.28] - - [2048, 256, 1, 15360] - - [546, 7963.97] + - [724, 7963.97] - - [4096, 200, 1, 5120] - - [585, 6861.62] + - [763, 6861.62] - - [1024, 512, 1, 8192] - - [571, 7473.25] + - [749, 7473.25] - - [4096, 200, 1, 896] - - [588, 6443.25] + - [766, 6443.25] - - [2048, 512, 1, 8320] - - [578, 8810.24] + - [756, 8810.24] - - [1024, 1024, 1, 10240] - - [586, 8436.7] + - [764, 8436.7] - - [1024, 200, 1, 768] - - [572, 4087.58] + - [750, 4087.58] - - [2048, 200, 1, 640] - - [583, 5416.3] + - [761, 5416.3] - - [512, 200, 1, 2048] - - [537, 2702.62] + - [715, 2702.62] - - [1024, 1024, 1, 9216] - - [575, 8499.08] + - [753, 8499.08] - - [4096, 200, 1, 1408] - - [585, 6613.82] + - [763, 6613.82] - - [1024, 256, 1, 13312] - - [473, 6643.54] + - [651, 6643.54] - - [1024, 256, 1, 128] - - [504, 2706.1] + - [682, 2706.1] - - [2048, 200, 1, 5632] - - [583, 6270.12] + - [761, 6270.12] - - [64, 1024, 1, 512] - - [562, 1310.82] + - [740, 1310.82] - - [1024, 512, 1, 2560] - - [583, 7731.54] + - [761, 7731.54] - - [4096, 200, 1, 1280] - - [565, 6566.83] + - [743, 6566.83] - - [1024, 200, 1, 4096] - - [583, 4911.46] + - [761, 4911.46] - - [1024, 1024, 1, 2560] - - [574, 8630.35] + - [752, 8630.35] - - [2048, 512, 1, 64] - - [578, 4152.88] + - [756, 4152.88] - - [2048, 200, 1, 8192] - - [572, 6234.21] + - [750, 6234.21] - - [2048, 512, 1, 3072] - - [582, 8614.85] + - [760, 8614.85] - - [4096, 1024, 1, 5120] - - [480, 9573.75] + - [658, 9573.75] - - [4096, 256, 1, 640] - - [567, 7913.88] + - [745, 7913.88] - - [1024, 256, 1, 1280] - - [572, 5706.64] + - [750, 5706.64] - - [2048, 1024, 1, 1920] - - [482, 9141.34] + - [660, 9141.34] - - [2048, 256, 1, 4096] - - [572, 7937.28] + - [750, 7937.28] - - [2048, 1024, 1, 15360] - - [485, 9351.96] + - [663, 9351.96] - - [4096, 200, 1, 16384] - - [474, 6975.21] + - [652, 6975.21] - - [1, 1024, 1, 4096] - - [592, 60.7815] + - [770, 60.7815] - - [4096, 1024, 1, 2816] - - [480, 9583.98] + - [658, 9583.98] - - [4096, 200, 1, 1664] - - [567, 6658.7] + - [745, 6658.7] - - [4096, 512, 1, 256] - - [498, 7731.54] + - [676, 7731.54] - - [1024, 200, 1, 896] - - [572, 4193.45] + - [750, 4193.45] - - [2048, 200, 1, 6656] - - [583, 6291.17] + - [761, 6291.17] - - [2048, 1024, 1, 5120] - - [482, 9270.57] + - [660, 9270.57] - - [512, 1024, 1, 768] - - [572, 7099.06] + - [750, 7099.06] - - [2048, 512, 1, 14336] - - [552, 8559.13] + - [730, 8559.13] - - [2048, 200, 1, 8320] - - [572, 6314.72] + - [750, 6314.72] - - [4096, 256, 1, 3840] - - [588, 8718.56] + - [766, 8718.56] - - [2048, 1024, 1, 4096] - - [469, 8973.38] + - [647, 8973.38] - - [1024, 1024, 1, 3200] - - [578, 8701.98] + - [756, 8701.98] - - [1024, 256, 1, 4608] - - [572, 6268.05] + - [750, 6268.05] - - [4096, 512, 1, 4608] - - [480, 9316.47] + - [658, 9316.47] - - [2048, 512, 1, 2048] - - [571, 8462.76] + - [749, 8462.76] - - [4096, 512, 1, 1664] - - [480, 9074.53] + - [658, 9074.53] - - [4096, 256, 1, 4608] - - [567, 8718.05] + - [745, 8718.05] - - [1024, 512, 1, 32] - - [560, 1807.99] + - [738, 1807.99] - - [1024, 512, 1, 3840] - - [572, 7936.34] + - [750, 7936.34] - - [2048, 512, 1, 1920] - - [588, 8548.27] + - [766, 8548.27] - - [2048, 1024, 1, 896] - - [480, 8843.51] + - [658, 8843.51] - - [4096, 200, 1, 6144] - - [588, 6864.76] + - [766, 6864.76] - - [1024, 512, 1, 13312] - - [541, 7763.19] + - [719, 7763.19] - - [4096, 1024, 1, 4160] - - [480, 9650.72] + - [658, 9650.72] - - [2048, 200, 1, 2816] - - [572, 6119.76] + - [750, 6119.76] - - [1024, 1024, 1, 3840] - - [567, 8709.5] + - [745, 8709.5] - - [128, 1024, 1, 1024] - - [590, 2577.25] + - [768, 2577.25] - - [2048, 1024, 1, 11264] - - [485, 9339.06] + - [663, 9339.06] - - [2048, 1024, 1, 384] - - [574, 8210.81] + - [752, 8210.81] - - [1024, 256, 1, 2048] - - [595, 5755.58] + - [773, 5755.58] - - [2048, 1024, 1, 3840] - - [482, 9288.96] + - [660, 9288.96] - - [4096, 256, 1, 8320] - - [588, 8812.38] + - [766, 8812.38] - - [2048, 256, 1, 3840] - - [564, 7857.05] + - [742, 7857.05] - - [64, 256, 1, 512] - - [562, 336.182] + - [740, 336.182] - - [4096, 512, 1, 1280] - - [482, 8993.52] + - [660, 8993.52] - - [512, 256, 1, 1280] - - [514, 2996.03] + - [692, 2996.03] - - [1024, 512, 1, 7680] - - [572, 8041.59] + - [750, 8041.59] - - [4096, 1024, 1, 1152] - - [480, 9368.48] + - [658, 9368.48] - - [256, 200, 1, 512] - - [514, 993.07] + - [692, 993.07] - - [256, 1024, 1, 2048] - - [591, 4759.59] + - [769, 4759.59] - - [2048, 200, 1, 10240] - - [583, 6329.03] + - [761, 6329.03] - - [2048, 512, 1, 5120] - - [584, 8732.56] + - [762, 8732.56] - - [2048, 1024, 1, 1408] - - [482, 9006.9] + - [660, 9006.9] - - [512, 1024, 1, 512] - - [572, 6528.2] + - [750, 6528.2] - - [1024, 200, 1, 11264] - - [539, 5194.82] + - [717, 5194.82] - - [512, 1024, 1, 1024] - - [525, 6337.1] + - [703, 6337.1] - - [2048, 512, 1, 32] - - [491, 2777.78] + - [669, 2777.78] - - [4096, 256, 1, 2560] - - [574, 8621.49] + - [752, 8621.49] - - [4096, 256, 1, 64] - - [508, 4194.4] + - [686, 4194.4] - - [32, 1024, 1, 1024] - - [509, 778.264] + - [687, 778.264] - - [2048, 200, 1, 768] - - [583, 5507.33] + - [761, 5507.33] - - [512, 512, 1, 2048] - - [531, 5338.91] + - [709, 5338.91] - - [2048, 512, 1, 2560] - - [585, 8643.69] + - [763, 8643.69] - - [512, 256, 1, 512] - - [564, 2542.1] + - [742, 2542.1] - - [1024, 200, 1, 7680] - - [539, 5047.8] + - [717, 5047.8] - - [4096, 512, 1, 896] - - [480, 8856.85] + - [658, 8856.85] - - [4096, 1024, 1, 3072] - - [480, 9492.17] + - [658, 9492.17] - - [4096, 200, 1, 13312] - - [472, 6900.73] + - [650, 6900.73] - - [2048, 512, 1, 7168] - - [573, 8788.1] + - [751, 8788.1] - - [2048, 1024, 1, 2816] - - [485, 9229.88] + - [663, 9229.88] - - [2048, 512, 1, 128] - - [503, 5630.04] + - [681, 5630.04] - - [1024, 256, 1, 8192] - - [595, 6203.83] + - [773, 6203.83] - - [4096, 1024, 1, 1792] - - [480, 9510.42] + - [658, 9510.42] - - [1024, 200, 1, 6656] - - [564, 5002.85] + - [742, 5002.85] - - [1024, 1024, 1, 1024] - - [501, 8095.26] + - [679, 8095.26] - - [4096, 200, 1, 2304] - - [585, 6754.45] + - [763, 6754.45] - - [4096, 512, 1, 1152] - - [480, 8974.54] + - [658, 8974.54] - - [512, 200, 1, 1024] - - [562, 2233.01] + - [740, 2233.01] - - [1024, 256, 1, 3840] - - [583, 6244.72] + - [761, 6244.72] - - [512, 512, 1, 768] - - [572, 5331.84] + - [750, 5331.84] - - [2048, 512, 1, 4096] - - [582, 8621.76] + - [760, 8621.76] - - [2048, 256, 1, 2560] - - [572, 7770.93] + - [750, 7770.93] - - [2048, 256, 1, 4160] - - [583, 7923.08] + - [761, 7923.08] - - [1024, 256, 1, 64] - - [479, 1705.1] + - [657, 1705.1] - - [4096, 512, 1, 7680] - - [480, 9364.57] + - [658, 9364.57] - - [1024, 512, 1, 1664] - - [583, 7594.24] + - [761, 7594.24] - - [2048, 512, 1, 2080] - - [574, 8570.67] + - [752, 8570.67] - - [2048, 512, 1, 3840] - - [585, 8729.14] + - [763, 8729.14] - - [4096, 1024, 1, 384] - - [480, 8764.86] + - [658, 8764.86] - - [4096, 200, 1, 3072] - - [574, 6772.39] + - [752, 6772.39] - - [1024, 512, 1, 14336] - - [542, 7680.97] + - [720, 7680.97] - - [1024, 200, 1, 1920] - - [564, 4637.08] + - [742, 4637.08] - - [1024, 1024, 1, 1664] - - [578, 8506.49] + - [756, 8506.49] - - [512, 1024, 1, 2304] - - [572, 7775.33] + - [750, 7775.33] - - [2048, 1024, 1, 1792] - - [480, 9123.46] + - [658, 9123.46] - - [32, 200, 1, 512] - - [580, 125.744] + - [758, 125.744] - - [4096, 256, 1, 11264] - - [585, 8822.31] + - [763, 8822.31] - - [4096, 256, 1, 1408] - - [585, 8419.32] + - [763, 8419.32] - - [1024, 256, 1, 7168] - - [572, 6377.54] + - [750, 6377.54] - - [2048, 256, 1, 1152] - - [583, 7401.81] + - [761, 7401.81] - - [256, 256, 1, 512] - - [562, 1314.93] + - [740, 1314.93] - - [1024, 512, 1, 1280] - - [572, 7410.53] + - [750, 7410.53] - - [512, 512, 1, 1792] - - [564, 5931.44] + - [742, 5931.44] - - [2048, 200, 1, 12288] - - [546, 6242.25] + - [724, 6242.25] - - [2048, 200, 1, 1664] - - [583, 5953.75] + - [761, 5953.75] - - [4096, 200, 1, 4608] - - [578, 6853.54] + - [756, 6853.54] - - [512, 1024, 1, 2560] - - [572, 7778.13] + - [750, 7778.13] - - [4096, 200, 1, 384] - - [565, 5765.73] + - [743, 5765.73] - - [128, 512, 1, 512] - - [562, 1302.68] + - [740, 1302.68] - - [1024, 200, 1, 256] - - [566, 2861.93] + - [744, 2861.93] - - [256, 1024, 1, 1024] - - [507, 4522.26] + - [685, 4522.26] - - [2048, 200, 1, 128] - - [572, 3310.0] + - [750, 3310.0] - - [2048, 200, 1, 11264] - - [553, 6168.2] + - [731, 6168.2] - - [1024, 512, 1, 1920] - - [583, 7649.29] + - [761, 7649.29] - - [4096, 256, 1, 1536] - - [578, 8427.33] + - [756, 8427.33] - - [4096, 1024, 1, 3584] - - [480, 9618.0] + - [658, 9618.0] - - [2048, 256, 1, 256] - - [572, 5464.99] + - [750, 5464.99] - - [2048, 1024, 1, 768] - - [480, 8726.87] + - [658, 8726.87] - - [4096, 256, 1, 10240] - - [574, 8790.89] + - [752, 8790.89] - - [2048, 256, 1, 10240] - - [554, 7665.31] + - [732, 7665.31] - - [4096, 200, 1, 14336] - - [588, 6916.18] + - [766, 6916.18] - - [1024, 512, 1, 5120] - - [526, 7420.36] + - [704, 7420.36] - - [1024, 512, 1, 8320] - - [583, 8061.31] + - [761, 8061.31] - - [256, 200, 1, 2048] - - [538, 1916.36] + - [716, 1916.36] - - [1024, 200, 1, 640] - - [566, 3873.39] + - [744, 3873.39] - - [1024, 512, 1, 10240] - - [571, 7526.9] + - [749, 7526.9] - - [1024, 200, 1, 4160] - - [583, 4928.19] + - [761, 4928.19] - - [1024, 200, 1, 5632] - - [564, 4978.66] + - [742, 4978.66] - - [1024, 1024, 1, 2048] - - [519, 7937.28] + - [697, 7937.28] - - [1024, 256, 1, 6656] - - [583, 6373.68] + - [761, 6373.68] - - [2048, 1024, 1, 8320] - - [480, 9333.15] + - [658, 9333.15] - - [1024, 256, 1, 10240] - - [572, 6407.29] + - [750, 6407.29] - - [2048, 256, 1, 2080] - - [572, 7714.58] + - [750, 7714.58] - - [4096, 256, 1, 128] - - [486, 5765.47] + - [664, 5765.47] - - [1024, 256, 1, 768] - - [577, 5210.42] + - [755, 5210.42] - - [2048, 256, 1, 896] - - [583, 7267.46] + - [761, 7267.46] - - [64, 512, 1, 2048] - - [549, 1296.64] + - [727, 1296.64] - - [4096, 512, 1, 2048] - - [482, 9121.25] + - [660, 9121.25] - - [512, 256, 1, 2048] - - [535, 3283.31] + - [713, 3283.31] - - [4096, 256, 1, 16640] - - [567, 8839.88] + - [745, 8839.88] - - [4096, 512, 1, 2560] - - [485, 9222.15] + - [663, 9222.15] - - [1024, 512, 1, 15360] - - [536, 7865.66] + - [714, 7865.66] - - [4096, 1024, 1, 2304] - - [480, 9558.26] + - [658, 9558.26] - - [4096, 200, 1, 1152] - - [585, 6531.93] + - [763, 6531.93] - - [2048, 200, 1, 6144] - - [583, 6277.75] + - [761, 6277.75] - - [1024, 1024, 1, 7680] - - [578, 8799.34] + - [756, 8799.34] - - [2048, 200, 1, 1920] - - [583, 6031.02] + - [761, 6031.02] - - [32, 1024, 1, 2048] - - [557, 1174.98] + - [735, 1174.98] - - [1024, 200, 1, 3584] - - [564, 4880.44] + - [742, 4880.44] - - [4096, 256, 1, 2080] - - [571, 8557.22] + - [749, 8557.22] - - [1024, 1024, 1, 16384] - - [472, 8618.65] + - [650, 8618.65] - - [1024, 256, 1, 1408] - - [583, 5803.54] + - [761, 5803.54] - - [1024, 256, 1, 4096] - - [593, 6037.78] + - [771, 6037.78] - - [2048, 200, 1, 14336] - - [583, 6364.48] + - [761, 6364.48] - - [4096, 512, 1, 5120] - - [482, 9302.05] + - [660, 9302.05] - - [1024, 512, 1, 6144] - - [518, 7469.09] + - [696, 7469.09] - - [1024, 512, 1, 2304] - - [583, 7759.35] + - [761, 7759.35] - - [4096, 200, 1, 4160] - - [567, 6843.22] + - [745, 6843.22] - - [4096, 200, 1, 1536] - - [578, 6628.27] + - [756, 6628.27] - - [4096, 1024, 1, 6144] - - [480, 9593.08] - - - [1280, 384, 1, 64] - - [611, 3196.98] + - [658, 9593.08] - - [256, 64, 1, 1225] - - [612, 1194.77] + - [790, 1194.77] - - [2048, 320, 1, 64] - - [614, 3449.36] - - - [256, 48, 1, 1225] - - [605, 913.498] - - - [2048, 192, 1, 64] - - [604, 2516.68] + - [792, 3449.36] - - [1024, 128, 1, 289] - - [618, 2869.78] - - - [1280, 192, 1, 64] - - [597, 1872.56] - - - [192, 32, 1, 1225] - - [602, 505.906] - - - [1280, 448, 1, 64] - - [598, 3078.97] + - [796, 2869.78] - - [384, 64, 1, 1225] - - [603, 1511.43] + - [781, 1511.43] - - [2048, 384, 1, 64] - - [616, 3836.35] - - - [288, 48, 1, 1225] - - [599, 1032.69] + - [794, 3836.35] - - [64, 80, 1, 5329] - - [615, 888.267] + - [793, 888.267] - - [1024, 384, 1, 289] - - [609, 4291.62] + - [787, 4291.62] - - [2048, 448, 1, 64] - - [608, 3783.62] - - - [1280, 320, 1, 64] - - [614, 2777.05] - - - [192, 64, 1, 1225] - - [599, 926.997] - - - [384, 192, 1, 1225] - - [610, 2560.1] - - - [1536, 256, 1, 64] - - [617, 2621.54] - - - [192, 48, 1, 1225] - - [602, 698.714] - - - [768, 128, 1, 289] - - [619, 2291.22] - - - [1024, 256, 1, 289] - - [617, 4064.46] + - [786, 3783.62] - - [768, 192, 1, 289] - - [613, 2690.43] - - - [1536, 384, 1, 64] - - [600, 3145.83] + - [791, 2690.43] - - [288, 64, 1, 1225] - - [602, 1142.77] - - - [1024, 192, 1, 289] - - [607, 3243.23] + - [780, 1142.77] - - [384, 96, 1, 1225] - - [620, 1844.81] - - - [160, 64, 1, 5329] - - [606, 1564.58] - - - [768, 160, 1, 289] - - [601, 2386.68] + - [798, 1844.81] - - [1024, 3392, 1, 4096] - - [646, 8503.02] + - [824, 8503.02] - - [1024, 3301, 1, 4096] - - [648, 8414.1] + - [826, 8414.1] - - [1024, 3443, 1, 4096] - - [635, 8536.59] + - [813, 8536.59] - - [132, 134, 480, 64] - - [673, 4149.27] + - [851, 4149.27] - - [162, 162, 400, 64] - - [661, 5539.73] + - [839, 5539.73] - - [4096, 3548, 1, 1024] - - [627, 9773.01] + - [805, 9773.01] - - [4096, 2977, 1, 1024] - - [628, 9574.43] + - [806, 9574.43] - - [132, 135, 480, 64] - - [673, 4167.51] + - [851, 4167.51] - - [1024, 2985, 1, 4096] - - [631, 9133.99] + - [809, 9133.99] - - [33708, 3681, 1, 1024] - - [628, 10033.8] + - [806, 10033.8] - - [4096, 3443, 1, 1024] - - [628, 9513.78] + - [806, 9513.78] - - [11, 11, 5456, 64] - - [670, 627.346] + - [848, 627.346] - - [1024, 3400, 1, 4096] - - [649, 8420.02] + - [827, 8420.02] - - [4096, 3995, 1, 1024] - - [627, 9693.87] + - [805, 9693.87] - - [4096, 3190, 1, 1024] - - [627, 9474.84] + - [805, 9474.84] - - [4096, 3594, 1, 1024] - - [628, 9315.83] + - [806, 9315.83] - - [159, 162, 400, 64] - - [660, 5429.98] + - [838, 5429.98] - - [1024, 3565, 1, 4096] - - [643, 8532.8] + - [821, 8532.8] - - [4096, 3422, 1, 1024] - - [628, 9459.24] + - [806, 9459.24] - - [1024, 3214, 1, 4096] - - [648, 8064.92] + - [826, 8064.92] - - [33708, 3584, 1, 1024] - - [629, 10129.0] + - [807, 10129.0] - - [33708, 3640, 1, 1024] - - [626, 9919.22] + - [804, 9919.22] - - [4096, 3263, 1, 1024] - - [626, 9699.35] + - [804, 9699.35] - - [4096, 3296, 1, 1024] - - [626, 9780.8] + - [804, 9780.8] - - [1024, 3557, 1, 4096] - - [647, 8526.89] + - [825, 8526.89] - - [4096, 3463, 1, 1024] - - [626, 9578.13] + - [804, 9578.13] - - [4096, 3528, 1, 1024] - - [626, 9739.92] + - [804, 9739.92] - - [14, 14, 4368, 64] - - [658, 991.276] + - [836, 991.276] - - [4096, 3226, 1, 1024] - - [626, 9587.19] + - [804, 9587.19] - - [4096, 3439, 1, 1024] - - [629, 9499.72] + - [807, 9499.72] - - [1024, 3523, 1, 4096] - - [649, 8393.58] + - [827, 8393.58] - - [1024, 3098, 1, 4096] - - [655, 7882.87] + - [833, 7882.87] - - [4096, 3121, 1, 1024] - - [626, 9296.23] + - [804, 9296.23] - - [33708, 3894, 1, 1024] - - [627, 9952.27] + - [805, 9952.27] - - [1024, 3548, 1, 4096] - - [633, 8432.45] + - [811, 8432.45] - - [1024, 3451, 1, 4096] - - [646, 8456.44] + - [824, 8456.44] - - [4096, 3353, 1, 1024] - - [628, 9289.08] + - [806, 9289.08] - - [4096, 3402, 1, 1024] - - [628, 9406.44] + - [806, 9406.44] - - [4096, 3939, 1, 1024] - - [626, 9549.59] + - [804, 9549.59] - - [133, 133, 480, 64] - - [673, 4124.31] + - [851, 4124.31] - - [1024, 3559, 1, 4096] - - [648, 8587.04] + - [826, 8587.04] - - [1024, 2977, 1, 4096] - - [631, 9084.59] + - [809, 9084.59] - - [1024, 3478, 1, 4096] - - [642, 8342.85] + - [820, 8342.85] - - [134, 134, 480, 64] - - [675, 4204.43] + - [853, 4204.43] - - [1024, 3368, 1, 4096] - - [648, 8277.43] + - [826, 8277.43] - - [4096, 4012, 1, 1024] - - [628, 9726.57] + - [806, 9726.57] - - [4096, 3486, 1, 1024] - - [626, 9639.71] + - [804, 9639.71] - - [1024, 3479, 1, 4096] - - [636, 8420.37] + - [814, 8420.37] - - [1024, 3505, 1, 4096] - - [648, 8310.66] + - [826, 8310.66] - - [4096, 3381, 1, 1024] - - [629, 9357.75] + - [807, 9357.75] - - [4096, 3430, 1, 1024] - - [626, 9482.36] + - [804, 9482.36] - - [1024, 3554, 1, 4096] - - [648, 8592.38] + - [826, 8592.38] - - [4096, 3271, 1, 1024] - - [626, 9715.41] + - [804, 9715.41] - - [1024, 3063, 1, 4096] - - [630, 9388.56] + - [808, 9388.56] - - [1024, 3209, 1, 4096] - - [648, 8212.74] + - [826, 8212.74] - - [4096, 3503, 1, 1024] - - [628, 9680.59] + - [806, 9680.59] - - [4096, 3344, 1, 1024] - - [626, 9268.55] + - [804, 9268.55] - - [1024, 3147, 1, 4096] - - [649, 8037.2] + - [827, 8037.2] - - [1024, 3322, 1, 4096] - - [647, 8356.32] + - [825, 8356.32] - - [1024, 3341, 1, 4096] - - [648, 8316.33] + - [826, 8316.33] - - [1024, 3516, 1, 4096] - - [630, 8397.12] + - [808, 8397.12] - - [102, 101, 624, 64] - - [661, 4709.59] + - [839, 4709.59] - - [1024, 3454, 1, 4096] - - [647, 8425.6] + - [825, 8425.6] - - [4096, 3969, 1, 1024] - - [628, 9640.15] + - [806, 9640.15] - - [4096, 3466, 1, 1024] - - [628, 9576.83] + - [806, 9576.83] - - [1024, 3999, 1, 1024] - - [631, 9207.15] + - [809, 9207.15] - - [1024, 4032, 1, 1024] - - [632, 9294.56] + - [810, 9294.56] - - [1024, 3403, 1, 4096] - - [646, 8357.97] + - [824, 8357.97] - - [4096, 3361, 1, 1024] - - [628, 9308.78] + - [806, 9308.78] - - [1024, 3527, 1, 4096] - - [647, 8512.19] + - [825, 8512.19] - - [1024, 3822, 1, 4096] - - [631, 8991.13] + - [809, 8991.13] - - [4096, 3315, 1, 1024] - - [626, 9834.96] + - [804, 9834.96] - - [232, 232, 272, 64] - - [660, 6481.62] + - [838, 6481.62] - - [1024, 3336, 1, 4096] - - [649, 8295.61] + - [827, 8295.61] - - [228, 232, 272, 64] - - [661, 6327.85] + - [839, 6327.85] - - [4096, 3547, 1, 1024] - - [626, 9781.56] + - [804, 9781.56] - - [4096, 3340, 1, 1024] - - [628, 9269.72] + - [806, 9269.72] - - [1024, 3906, 1, 1024] - - [632, 9018.38] + - [810, 9018.38] - - [1024, 3295, 1, 4096] - - [646, 8194.83] + - [824, 8194.83] - - [4096, 3294, 1, 1024] - - [629, 9762.16] + - [807, 9762.16] - - [33708, 3968, 1, 1024] - - [629, 10147.8] + - [807, 10147.8] - - [1024, 3473, 1, 4096] - - [635, 8318.68] + - [813, 8318.68] - - [1024, 3072, 1, 4096] - - [632, 9370.13] + - [810, 9370.13] - - [4096, 3189, 1, 1024] - - [626, 9470.26] + - [804, 9470.26] - - [4096, 3494, 1, 1024] - - [626, 9661.32] + - [804, 9661.32] - - [1024, 3522, 1, 4096] - - [649, 8459.23] + - [827, 8459.23] - - [33708, 3944, 1, 1024] - - [629, 10060.2] + - [807, 10060.2] - - [135, 135, 480, 64] - - [674, 4257.03] + - [852, 4257.03] - - [4096, 3421, 1, 1024] - - [626, 9456.98] + - [804, 9456.98] - - [32, 32, 1984, 64] - - [671, 3436.24] + - [849, 3436.24] - - [4096, 3311, 1, 1024] - - [626, 9810.88] + - [804, 9810.88] - - [1024, 3990, 1, 1024] - - [633, 9197.74] + - [811, 9197.74] - - [1024, 3290, 1, 4096] - - [646, 8229.63] + - [824, 8229.63] - - [4096, 3565, 1, 1024] - - [627, 9824.48] + - [805, 9824.48] - - [1024, 3484, 1, 4096] - - [636, 8575.38] + - [814, 8575.38] - - [4096, 3384, 1, 1024] - - [626, 9366.54] + - [804, 9366.54] - - [1024, 3422, 1, 4096] - - [646, 8484.12] + - [824, 8484.12] - - [4096, 3681, 1, 1024] - - [627, 9520.16] + - [805, 9520.16] - - [1024, 3584, 1, 1024] - - [653, 8583.37] + - [831, 8583.37] - - [4096, 4050, 1, 1024] - - [628, 9807.35] + - [806, 9807.35] - - [1024, 3996, 1, 4096] - - [629, 9181.7] + - [807, 9181.7] - - [4096, 3169, 1, 1024] - - [627, 9411.4] + - [805, 9411.4] - - [4096, 3538, 1, 1024] - - [627, 9765.99] + - [805, 9765.99] - - [1024, 3495, 1, 4096] - - [633, 8295.95] + - [811, 8295.95] - - [4096, 3401, 1, 1024] - - [626, 9402.68] + - [804, 9402.68] - - [1024, 3560, 1, 4096] - - [647, 8513.45] + - [825, 8513.45] - - [133, 135, 480, 64] - - [674, 4199.08] + - [852, 4199.08] - - [1024, 3263, 1, 4096] - - [648, 8172.23] + - [826, 8172.23] - - [1024, 3870, 1, 4096] - - [628, 8996.27] + - [806, 8996.27] - - [4096, 3555, 1, 1024] - - [629, 9811.88] + - [807, 9811.88] - - [4096, 3412, 1, 1024] - - [626, 9432.09] + - [804, 9432.09] - - [101, 101, 624, 64] - - [660, 4667.69] + - [838, 4667.69] - - [1024, 3296, 1, 4096] - - [647, 8350.61] + - [825, 8350.61] - - [1024, 3379, 1, 4096] - - [649, 8432.94] + - [827, 8432.94] - - [4096, 3302, 1, 1024] - - [626, 9796.39] + - [804, 9796.39] - - [1024, 3490, 1, 4096] - - [646, 8538.44] + - [824, 8538.44] - - [1024, 3428, 1, 4096] - - [647, 8531.67] + - [825, 8531.67] - - [1024, 3976, 1, 4096] - - [628, 9327.87] + - [806, 9327.87] - - [4096, 3485, 1, 1024] - - [626, 9628.82] + - [804, 9628.82] - - [4096, 3534, 1, 1024] - - [626, 9755.97] + - [804, 9755.97] - - [1024, 3064, 1, 4096] - - [632, 9196.98] + - [810, 9196.98] - - [4096, 3216, 1, 1024] - - [628, 9563.44] + - [806, 9563.44] - - [1024, 3450, 1, 4096] - - [656, 8519.29] + - [834, 8519.29] - - [1024, 3533, 1, 4096] - - [647, 8495.77] + - [825, 8495.77] - - [1024, 4030, 1, 1024] - - [632, 9304.68] + - [810, 9304.68] - - [1024, 3311, 1, 4096] - - [647, 8278.6] + - [825, 8278.6] - - [1024, 3468, 1, 4096] - - [638, 8564.55] + - [816, 8564.55] - - [23, 23, 2720, 64] - - [662, 2311.55] + - [840, 2311.55] - - [4096, 3359, 1, 1024] - - [628, 9309.15] + - [806, 9309.15] - - [4096, 3392, 1, 1024] - - [628, 9388.19] + - [806, 9388.19] - - [1024, 3925, 1, 1024] - - [630, 9006.72] + - [808, 9006.72] - - [4096, 3233, 1, 1024] - - [626, 9603.64] + - [804, 9603.64] - - [4096, 3956, 1, 1024] - - [627, 9581.94] + - [805, 9581.94] - - [1024, 3463, 1, 4096] - - [648, 8293.97] + - [826, 8293.97] - - [1024, 3126, 1, 4096] - - [647, 7978.13] + - [825, 7978.13] - - [1024, 3363, 1, 4096] - - [640, 8267.47] + - [818, 8267.47] - - [4096, 3465, 1, 1024] - - [626, 9590.74] + - [804, 9590.74] - - [33708, 3996, 1, 1024] - - [627, 9899.99] + - [805, 9899.99] - - [1024, 3231, 1, 4096] - - [648, 8231.68] + - [826, 8231.68] - - [33708, 3978, 1, 1024] - - [627, 9853.64] + - [805, 9853.64] - - [4096, 3476, 1, 1024] - - [626, 9616.62] + - [804, 9616.62] - - [85, 85, 752, 64] - - [658, 4240.65] + - [836, 4240.65] - - [4096, 3339, 1, 1024] - - [628, 9249.81] + - [806, 9249.81] - - [4096, 3452, 1, 1024] - - [626, 9534.13] + - [804, 9534.13] - - [1024, 3396, 1, 4096] - - [647, 8451.23] + - [825, 8451.23] - - [4096, 3293, 1, 1024] - - [628, 9775.22] + - [806, 9775.22] - - [54, 54, 1184, 64] - - [660, 4153.54] + - [838, 4153.54] - - [1024, 3432, 1, 4096] - - [641, 8345.53] + - [819, 8345.53] - - [4096, 3493, 1, 1024] - - [629, 9649.9] + - [807, 9649.9] - - [4096, 3350, 1, 1024] - - [628, 9273.91] + - [806, 9273.91] - - [1024, 3079, 1, 4096] - - [656, 7775.66] + - [834, 7775.66] - - [1024, 3101, 1, 4096] - - [656, 7847.85] + - [834, 7847.85] - - [33708, 3939, 1, 1024] - - [629, 10054.4] + - [807, 10054.4] - - [4096, 3256, 1, 1024] - - [628, 9681.83] + - [806, 9681.83] - - [1024, 3439, 1, 4096] - - [647, 8531.11] + - [825, 8531.11] - - [1024, 3510, 1, 4096] - - [646, 8422.31] + - [824, 8422.31] - - [4096, 3900, 1, 1024] - - [627, 9468.61] + - [805, 9468.61] - - [1024, 3470, 1, 4096] - - [648, 8507.77] + - [826, 8507.77] - - [4096, 3456, 1, 1024] - - [628, 9577.46] + - [806, 9577.46] - - [4096, 3014, 1, 1024] - - [627, 9666.15] + - [805, 9666.15] - - [4096, 3367, 1, 1024] - - [629, 9328.36] + - [807, 9328.36] - - [4096, 3432, 1, 1024] - - [626, 9480.88] + - [804, 9480.88] - - [33708, 4026, 1, 1024] - - [629, 9972.83] + - [807, 9972.83] - - [4096, 3273, 1, 1024] - - [626, 9716.95] + - [804, 9716.95] - - [4096, 3130, 1, 1024] - - [626, 9311.4] + - [804, 9311.4] - - [1024, 3496, 1, 4096] - - [637, 8434.65] + - [815, 8434.65] - - [1024, 3995, 1, 4096] - - [622, 9157.73] + - [800, 9157.73] - - [1024, 3939, 1, 4096] - - [630, 9059.86] + - [808, 9059.86] - - [1024, 3121, 1, 4096] - - [654, 7963.43] + - [832, 7963.43] - - [1024, 3232, 1, 4096] - - [648, 8061.09] + - [826, 8061.09] - - [4096, 3147, 1, 1024] - - [628, 9364.63] + - [806, 9364.63] - - [4096, 3516, 1, 1024] - - [626, 9708.84] + - [804, 9708.84] - - [1024, 3969, 1, 1024] - - [632, 9168.68] + - [810, 9168.68] - - [1024, 3364, 1, 4096] - - [636, 8363.65] + - [814, 8363.65] - - [4096, 3411, 1, 1024] - - [629, 9442.77] + - [807, 9442.77] - - [147, 147, 432, 64] - - [673, 4843.21] + - [851, 4843.21] - - [4096, 3301, 1, 1024] - - [628, 9783.46] + - [806, 9783.46] - - [112, 111, 576, 64] - - [660, 5627.47] + - [838, 5627.47] - - [1024, 3513, 1, 4096] - - [647, 8725.41] + - [825, 8725.41] - - [1024, 3469, 1, 4096] - - [627, 8183.11] + - [805, 8183.11] - - [1024, 3095, 1, 4096] - - [648, 7887.87] + - [826, 7887.87] - - [4096, 3533, 1, 1024] - - [627, 9755.27] + - [805, 9755.27] - - [4096, 3390, 1, 1024] - - [626, 9377.21] + - [804, 9377.21] - - [4096, 3582, 1, 1024] - - [626, 9874.96] + - [804, 9874.96] - - [1024, 3956, 1, 1024] - - [632, 9058.82] + - [810, 9058.82] - - [4096, 3585, 1, 1024] - - [628, 9289.75] + - [806, 9289.75] - - [4096, 3231, 1, 1024] - - [627, 9597.15] + - [805, 9597.15] - - [1024, 3205, 1, 4096] - - [646, 8073.25] + - [824, 8073.25] - - [4096, 3496, 1, 1024] - - [627, 9668.38] + - [805, 9668.38] - - [1024, 3143, 1, 4096] - - [646, 8031.68] + - [824, 8031.68] - - [1024, 3318, 1, 4096] - - [643, 8261.43] + - [821, 8261.43] - - [1024, 3353, 1, 4096] - - [647, 8414.92] + - [825, 8414.92] - - [1024, 3464, 1, 4096] - - [646, 8310.03] + - [824, 8310.03] - - [4096, 2736, 1, 1024] - - [628, 9563.12] + - [806, 9563.12] - - [1024, 3402, 1, 4096] - - [643, 8413.84] + - [821, 8413.84] - - [4096, 3138, 1, 1024] - - [628, 9342.09] + - [806, 9342.09] - - [1024, 3860, 1, 4096] - - [631, 9008.57] + - [809, 9008.57] - - [148, 148, 432, 64] - - [673, 4915.7] + - [851, 4915.7] - - [1024, 3539, 1, 4096] - - [643, 8449.36] + - [821, 8449.36] - - [4096, 3211, 1, 1024] - - [628, 9551.28] + - [806, 9551.28] - - [1024, 3332, 1, 4096] - - [636, 8295.11] + - [814, 8295.11] - - [1024, 3466, 1, 4096] - - [647, 8339.25] + - [825, 8339.25] - - [4096, 3475, 1, 1024] - - [626, 9612.33] + - [804, 9612.33] - - [4096, 3524, 1, 1024] - - [629, 9722.74] + - [807, 9722.74] - - [4096, 2985, 1, 1024] - - [629, 9591.33] + - [807, 9591.33] - - [4096, 3222, 1, 1024] - - [626, 9577.48] + - [804, 9577.48] - - [4096, 3451, 1, 1024] - - [628, 9541.42] + - [806, 9541.42] - - [1024, 3181, 1, 4096] - - [646, 8118.89] + - [824, 8118.89] - - [1024, 3640, 1, 4096] - - [631, 8617.11] + - [809, 8617.11] - - [1024, 3375, 1, 4096] - - [635, 8419.75] + - [813, 8419.75] - - [1024, 3550, 1, 4096] - - [648, 8512.83] + - [826, 8512.83] - - [1024, 4020, 1, 1024] - - [632, 9266.9] + - [810, 9266.9] - - [1024, 3840, 1, 4096] - - [631, 8983.49] + - [809, 8983.49] - - [4096, 3349, 1, 1024] - - [626, 9279.96] + - [804, 9279.96] - - [4096, 3398, 1, 1024] - - [627, 9402.32] + - [805, 9402.32] - - [33708, 3976, 1, 1024] - - [628, 9849.54] + - [806, 9849.54] - - [1024, 2917, 1, 4096] - - [633, 8936.87] + - [811, 8936.87] - - [33708, 3910, 1, 1024] - - [626, 9983.35] + - [804, 9983.35] - - [4096, 3860, 1, 1024] - - [627, 9377.58] + - [805, 9377.58] - - [4096, 3304, 1, 1024] - - [629, 9798.44] + - [807, 9798.44] - - [1024, 3286, 1, 4096] - - [634, 8167.41] + - [812, 8167.41] - - [1024, 3460, 1, 4096] - - [644, 8539.56] + - [822, 8539.56] - - [1024, 4026, 1, 4096] - - [630, 9305.68] + - [808, 9305.68] - - [4096, 3471, 1, 1024] - - [628, 9596.71] + - [806, 9596.71] - - [193, 193, 320, 64] - - [676, 4758.46] + - [854, 4758.46] - - [1024, 3894, 1, 1024] - - [630, 8979.6] + - [808, 8979.6] - - [65, 65, 992, 64] - - [672, 2565.49] + - [850, 2565.49] - - [1024, 3506, 1, 4096] - - [644, 8593.22] + - [822, 8593.22] - - [35, 35, 1808, 64] - - [666, 2129.72] + - [844, 2129.72] - - [1024, 4000, 1, 1024] - - [630, 9204.6] + - [808, 9204.6] - - [1024, 3900, 1, 4096] - - [626, 9050.36] + - [804, 9050.36] - - [1024, 3445, 1, 4096] - - [649, 8551.65] + - [827, 8551.65] - - [4096, 3442, 1, 1024] - - [627, 9505.0] + - [805, 9505.0] - - [1024, 3358, 1, 4096] - - [648, 8437.16] + - [826, 8437.16] - - [13, 13, 4672, 64] - - [659, 860.665] + - [837, 860.665] - - [1024, 3211, 1, 4096] - - [652, 8085.25] + - [830, 8085.25] - - [4096, 3515, 1, 1024] - - [628, 9715.29] + - [806, 9715.29] - - [1024, 3564, 1, 4096] - - [634, 8760.37] + - [812, 8760.37] - - [4096, 3057, 1, 1024] - - [628, 9804.05] + - [806, 9804.05] - - [1024, 3343, 1, 4096] - - [646, 8363.8] + - [824, 8363.8] - - [4096, 3262, 1, 1024] - - [627, 9686.49] + - [805, 9686.49] - - [1024, 3518, 1, 4096] - - [646, 8455.05] + - [824, 8455.05] - - [77, 77, 816, 64] - - [665, 3505.94] + - [843, 3505.94] - - [33708, 3876, 1, 1024] - - [627, 9895.95] + - [805, 9895.95] - - [4096, 3462, 1, 1024] - - [628, 9570.31] + - [806, 9570.31] - - [1024, 3265, 1, 4096] - - [646, 8322.75] + - [824, 8322.75] - - [4096, 3389, 1, 1024] - - [627, 9382.86] + - [805, 9382.86] - - [4096, 3438, 1, 1024] - - [628, 9503.47] + - [806, 9503.47] - - [1024, 3955, 1, 1024] - - [630, 9064.45] + - [808, 9064.45] - - [1024, 3545, 1, 4096] - - [649, 8652.41] + - [827, 8652.41] - - [1024, 3144, 1, 4096] - - [649, 8060.55] + - [827, 8060.55] - - [1024, 3417, 1, 4096] - - [647, 8505.91] + - [825, 8505.91] - - [4096, 3543, 1, 1024] - - [626, 9775.67] + - [804, 9775.67] - - [4096, 3352, 1, 1024] - - [628, 9282.87] + - [806, 9282.87] - - [33708, 3975, 1, 1024] - - [629, 9849.49] + - [807, 9849.49] - - [148, 147, 432, 64] - - [673, 4876.15] + - [851, 4876.15] - - [4096, 3137, 1, 1024] - - [626, 9330.63] + - [804, 9330.63] - - [4096, 3506, 1, 1024] - - [629, 9682.76] + - [807, 9682.76] - - [1024, 3975, 1, 1024] - - [632, 9164.77] + - [810, 9164.77] - - [1024, 3859, 1, 4096] - - [630, 8983.84] + - [808, 8983.84] - - [4096, 3369, 1, 1024] - - [628, 9330.45] + - [806, 9330.45] - - [1024, 3434, 1, 4096] - - [646, 8486.98] + - [824, 8486.98] - - [1024, 3292, 1, 4096] - - [646, 8478.96] + - [824, 8478.96] - - [4096, 3523, 1, 1024] - - [626, 9734.83] + - [804, 9734.83] - - [4096, 3380, 1, 1024] - - [628, 9354.49] + - [806, 9354.49] - - [1024, 3408, 1, 4096] - - [649, 8441.03] + - [827, 8441.03] - - [4096, 3221, 1, 1024] - - [628, 9575.59] + - [806, 9575.59] - - [4096, 3270, 1, 1024] - - [628, 9717.95] + - [806, 9717.95] - - [143, 143, 432, 64] - - [674, 4643.45] + - [852, 4643.45] - - [111, 111, 576, 64] - - [666, 5475.04] + - [844, 5475.04] - - [1024, 3303, 1, 4096] - - [648, 8413.07] + - [826, 8413.07] - - [4096, 3502, 1, 1024] - - [628, 9679.87] + - [806, 9679.87] - - [1024, 3222, 1, 4096] - - [648, 8141.88] + - [826, 8141.88] - - [4096, 2505, 1, 1024] - - [626, 9594.95] + - [804, 9594.95] - - [4096, 3397, 1, 1024] - - [626, 9392.61] + - [804, 9392.61] - - [4096, 3562, 1, 1024] - - [626, 9827.58] + - [804, 9827.58] - - [4096, 3095, 1, 1024] - - [628, 9222.45] + - [806, 9222.45] - - [1024, 3226, 1, 4096] - - [644, 8027.03] + - [822, 8027.03] - - [177, 177, 352, 64] - - [661, 6406.96] + - [839, 6406.96] - - [4096, 3360, 1, 1024] - - [627, 9298.15] + - [805, 9298.15] - - [1024, 3942, 1, 1024] - - [632, 9061.59] + - [810, 9061.59] - - [1024, 3298, 1, 4096] - - [649, 8254.36] + - [827, 8254.36] - - [1024, 3381, 1, 4096] - - [648, 8508.81] + - [826, 8508.81] - - [4096, 3314, 1, 1024] - - [628, 9837.56] + - [806, 9837.56] - - [1024, 3492, 1, 4096] - - [636, 8583.39] + - [814, 8583.39] - - [1024, 3430, 1, 4096] - - [636, 8492.71] + - [814, 8492.71] - - [4096, 3977, 1, 1024] - - [628, 9656.45] + - [806, 9656.45] - - [4096, 3546, 1, 1024] - - [626, 9780.35] + - [804, 9780.35] - - [4096, 3640, 1, 1024] - - [626, 9415.51] + - [804, 9415.51] - - [4096, 3441, 1, 1024] - - [627, 9499.24] + - [805, 9499.24] - - [33708, 4059, 1, 1024] - - [629, 10051.9] + - [807, 10051.9] - - [1024, 3978, 1, 1024] - - [630, 9158.8] + - [808, 9158.8] - - [1024, 3376, 1, 4096] - - [648, 8415.44] + - [826, 8415.44] - - [1024, 3482, 1, 4096] - - [649, 8396.62] + - [827, 8396.62] - - [1024, 3563, 1, 4096] - - [632, 8424.18] + - [810, 8424.18] - - [4096, 4020, 1, 1024] - - [629, 9745.96] + - [807, 9745.96] - - [1024, 3271, 1, 4096] - - [647, 8289.68] + - [825, 8289.68] - - [1024, 3291, 1, 4096] - - [647, 8222.71] + - [825, 8222.71] - - [1024, 3431, 1, 4096] - - [642, 8464.4] + - [820, 8464.4] - - [1024, 3481, 1, 4096] - - [648, 8386.5] + - [826, 8386.5] - - [84, 85, 752, 64] - - [663, 4194.85] + - [841, 4194.85] - - [4096, 3461, 1, 1024] - - [626, 9579.67] + - [804, 9579.67] - - [1024, 3574, 1, 4096] - - [649, 8579.8] + - [827, 8579.8] - - [1024, 4059, 1, 1024] - - [630, 9330.54] + - [808, 9330.54] - - [84, 84, 752, 64] - - [670, 4141.46] + - [848, 4141.46] - - [1024, 3421, 1, 4096] - - [649, 8528.42] + - [827, 8528.42] - - [4096, 3224, 1, 1024] - - [628, 9589.95] + - [806, 9589.95] - - [4096, 3437, 1, 1024] - - [628, 9498.2] + - [806, 9498.2] - - [45, 45, 1424, 64] - - [660, 3314.58] + - [838, 3314.58] - - [4096, 3840, 1, 1024] - - [626, 9931.37] + - [804, 9931.37] - - [4096, 3168, 1, 1024] - - [628, 9412.16] + - [806, 9412.16] - - [33708, 3990, 1, 1024] - - [626, 9884.39] + - [804, 9884.39] - - [1024, 3349, 1, 4096] - - [648, 8421.4] + - [826, 8421.4] - - [4096, 3335, 1, 1024] - - [626, 9241.65] + - [804, 9241.65] - - [4096, 3400, 1, 1024] - - [628, 9407.35] + - [806, 9407.35] - - [160, 159, 400, 64] - - [675, 5708.94] + - [853, 5708.94] - - [1024, 3398, 1, 4096] - - [648, 8624.03] + - [826, 8624.03] - - [1024, 3780, 1, 4096] - - [628, 8756.78] + - [806, 8756.78] - - [29, 29, 2176, 64] - - [671, 2963.69] + - [849, 2963.69] - - [4096, 3098, 1, 1024] - - [626, 9229.82] + - [804, 9229.82] - - [1024, 4012, 1, 4096] - - [632, 9422.03] + - [810, 9422.03] - - [4096, 3505, 1, 1024] - - [628, 9687.65] + - [806, 9687.65] - - [4096, 3554, 1, 1024] - - [628, 9812.22] + - [806, 9812.22] - - [4096, 3063, 1, 1024] - - [628, 9825.1] + - [806, 9825.1] - - [1024, 3503, 1, 4096] - - [646, 8404.74] + - [824, 8404.74] - - [1024, 3166, 1, 4096] - - [649, 8084.93] + - [827, 8084.93] - - [1024, 3425, 1, 4096] - - [649, 8537.58] + - [827, 8537.58] - - [1024, 3344, 1, 4096] - - [640, 8351.16] + - [818, 8351.16] - - [4096, 3484, 1, 1024] - - [628, 9635.7] + - [806, 9635.7] - - [1024, 3681, 1, 1024] - - [631, 8457.18] + - [809, 8457.18] - - [1024, 4050, 1, 1024] - - [632, 9326.21] + - [810, 9326.21] - - [4096, 3379, 1, 1024] - - [626, 9356.16] + - [804, 9356.16] - - [4096, 3428, 1, 1024] - - [627, 9472.33] + - [805, 9472.33] - - [12, 12, 5040, 64] - - [665, 741.617] + - [843, 741.617] - - [27, 27, 2336, 64] - - [671, 2757.9] + - [849, 2757.9] - - [1024, 3304, 1, 4096] - - [649, 8317.82] + - [827, 8317.82] - - [1024, 3387, 1, 4096] - - [647, 8460.15] + - [825, 8460.15] - - [4096, 3126, 1, 1024] - - [629, 9308.48] + - [807, 9308.48] - - [1024, 3498, 1, 4096] - - [646, 8485.55] + - [824, 8485.55] - - [1024, 3436, 1, 4096] - - [648, 8397.71] + - [826, 8397.71] - - [4096, 3501, 1, 1024] - - [626, 9681.19] + - [804, 9681.19] - - [4096, 3358, 1, 1024] - - [628, 9304.9] + - [806, 9304.9] - - [4096, 3232, 1, 1024] - - [626, 9607.2] + - [804, 9607.2] - - [1024, 3585, 1, 4096] - - [630, 8510.74] + - [808, 8510.74] - - [4096, 3143, 1, 1024] - - [629, 9355.91] + - [807, 9355.91] - - [4096, 3464, 1, 1024] - - [628, 9585.95] + - [806, 9585.95] - - [1024, 3366, 1, 4096] - - [636, 8275.23] + - [814, 8275.23] - - [4096, 3375, 1, 1024] - - [626, 9342.13] + - [804, 9342.13] - - [4096, 2917, 1, 1024] - - [626, 9372.84] + - [804, 9372.84] - - [4096, 4026, 1, 1024] - - [628, 9759.15] + - [806, 9759.15] - - [49, 49, 1296, 64] - - [667, 3710.02] + - [845, 3710.02] - - [1024, 3277, 1, 4096] - - [647, 8217.1] + - [825, 8217.1] - - [1024, 3103, 1, 4096] - - [648, 7872.67] + - [826, 7872.67] - - [33708, 3995, 1, 1024] - - [628, 9893.08] + - [806, 9893.08] - - [1024, 3297, 1, 4096] - - [647, 8185.82] + - [825, 8185.82] - - [4096, 3545, 1, 1024] - - [628, 9789.43] + - [806, 9789.43] - - [1024, 3399, 1, 4096] - - [647, 8377.18] + - [825, 8377.18] - - [33708, 3796, 1, 1024] - - [627, 10008.0] + - [805, 10008.0] - - [4096, 3292, 1, 1024] - - [628, 9767.28] + - [806, 9767.28] - - [71, 71, 896, 64] - - [662, 3006.25] + - [840, 3006.25] - - [33708, 3859, 1, 1024] - - [629, 9860.37] + - [807, 9860.37] - - [4096, 3566, 1, 1024] - - [628, 9834.47] + - [806, 9834.47] - - [4096, 3894, 1, 1024] - - [626, 9456.67] + - [804, 9456.67] - - [4096, 3492, 1, 1024] - - [626, 9653.24] + - [804, 9653.24] - - [1024, 3977, 1, 1024] - - [632, 9161.33] + - [810, 9161.33] - - [1024, 3272, 1, 4096] - - [649, 8257.09] + - [827, 8257.09] - - [135, 134, 480, 64] - - [673, 4238.39] + - [851, 4238.39] - - [1024, 3355, 1, 4096] - - [647, 8374.64] + - [825, 8374.64] - - [4096, 3419, 1, 1024] - - [629, 9455.44] + - [807, 9455.44] - - [1024, 3404, 1, 4096] - - [648, 8580.28] + - [826, 8580.28] - - [4096, 3999, 1, 1024] - - [628, 9701.78] + - [806, 9701.78] - - [4096, 3166, 1, 1024] - - [626, 9410.48] + - [804, 9410.48] - - [33708, 3840, 1, 1024] - - [629, 10132.9] + - [807, 10132.9] - - [4096, 4032, 1, 1024] - - [629, 9762.86] + - [807, 9762.86] - - [1024, 3573, 1, 4096] - - [647, 8603.4] + - [825, 8603.4] - - [4096, 3366, 1, 1024] - - [629, 9322.63] + - [807, 9322.63] - - [1024, 3541, 1, 4096] - - [649, 8405.9] + - [827, 8405.9] - - [4096, 3207, 1, 1024] - - [626, 9544.25] + - [804, 9544.25] - - [4096, 3272, 1, 1024] - - [628, 9716.73] + - [806, 9716.73] - - [1024, 3334, 1, 4096] - - [646, 8241.39] + - [824, 8241.39] - - [228, 228, 272, 64] - - [661, 6232.45] + - [839, 6232.45] - - [4096, 3183, 1, 1024] - - [628, 9452.44] + - [806, 9452.44] - - [4096, 3536, 1, 1024] - - [627, 9759.44] + - [805, 9759.44] - - [1024, 4005, 1, 1024] - - [631, 9225.83] + - [809, 9225.83] - - [1024, 3245, 1, 4096] - - [648, 8074.31] + - [826, 8074.31] - - [4096, 3447, 1, 1024] - - [627, 9525.84] + - [805, 9525.84] - - [1024, 3183, 1, 4096] - - [647, 8121.62] + - [825, 8121.62] - - [1024, 3361, 1, 4096] - - [649, 8285.86] + - [827, 8285.86] - - [33708, 3870, 1, 1024] - - [627, 9879.35] + - [805, 9879.35] - - [1024, 3321, 1, 4096] - - [648, 8408.67] + - [826, 8408.67] - - [1024, 3968, 1, 1024] - - [630, 9202.05] + - [808, 9202.05] - - [1024, 3486, 1, 4096] - - [644, 8258.89] + - [822, 8258.89] - - [4096, 4005, 1, 1024] - - [628, 9723.98] + - [806, 9723.98] - - [4096, 3410, 1, 1024] - - [629, 9440.5] + - [807, 9440.5] - - [1024, 3944, 1, 1024] - - [632, 9040.82] + - [810, 9040.82] - - [4096, 3300, 1, 1024] - - [627, 9789.9] + - [805, 9789.9] - - [4096, 3579, 1, 1024] - - [629, 9859.44] + - [807, 9859.44] - - [4096, 3483, 1, 1024] - - [629, 9624.31] + - [807, 9624.31] - - [4096, 3532, 1, 1024] - - [628, 9742.76] + - [806, 9742.76] - - [1024, 3140, 1, 4096] - - [648, 7899.65] + - [826, 7899.65] - - [1024, 3372, 1, 4096] - - [646, 8237.07] + - [824, 8237.07] - - [1024, 3224, 1, 4096] - - [649, 8159.13] + - [827, 8159.13] - - [4096, 3230, 1, 1024] - - [628, 9601.25] + - [806, 9601.25] - - [4096, 3427, 1, 1024] - - [628, 9466.57] + - [806, 9466.57] - - [1024, 3796, 1, 1024] - - [632, 8739.78] + - [810, 8739.78] - - [143, 148, 432, 64] - - [673, 4762.0] + - [851, 4762.0] - - [1024, 3616, 1, 4096] - - [631, 8445.89] + - [809, 8445.89] - - [1024, 3315, 1, 4096] - - [648, 8403.21] + - [826, 8403.21] - - [1024, 3476, 1, 4096] - - [646, 8523.68] + - [824, 8523.68] - - [1024, 3509, 1, 4096] - - [646, 8345.05] + - [824, 8345.05] - - [4096, 3357, 1, 1024] - - [628, 9300.16] + - [806, 9300.16] - - [4096, 3406, 1, 1024] - - [628, 9427.44] + - [806, 9427.44] - - [1024, 3558, 1, 4096] - - [647, 8525.78] + - [825, 8525.78] - - [4096, 3593, 1, 1024] - - [628, 9302.2] + - [806, 9302.2] - - [4096, 3247, 1, 1024] - - [628, 9648.5] + - [806, 9648.5] - - [4096, 3088, 1, 1024] - - [628, 9204.21] + - [806, 9204.21] - - [1024, 3213, 1, 4096] - - [646, 8054.31] + - [824, 8054.31] - - [4096, 3511, 1, 1024] - - [626, 9702.7] + - [804, 9702.7] - - [122, 122, 528, 64] - - [667, 6293.39] + - [845, 6293.39] - - [1024, 3365, 1, 4096] - - [643, 8413.62] + - [821, 8413.62] - - [1024, 3504, 1, 4096] - - [645, 8414.46] + - [823, 8414.46] - - [1024, 3442, 1, 4096] - - [648, 8684.0] + - [826, 8684.0] - - [4096, 3474, 1, 1024] - - [626, 9611.6] + - [804, 9611.6] - - [4096, 2984, 1, 1024] - - [627, 9592.82] + - [805, 9592.82] - - [1024, 3876, 1, 4096] - - [630, 9085.95] + - [808, 9085.95] - - [4096, 3337, 1, 1024] - - [628, 9246.22] + - [806, 9246.22] - - [4096, 3450, 1, 1024] - - [628, 9534.63] + - [806, 9534.63] - - [1024, 3547, 1, 4096] - - [648, 8386.73] + - [826, 8386.73] - - [4096, 3291, 1, 1024] - - [627, 9759.34] + - [805, 9759.34] - - [1024, 3340, 1, 4096] - - [647, 8237.97] + - [825, 8237.97] - - [4096, 3491, 1, 1024] - - [628, 9656.59] + - [806, 9656.59] - - [4096, 3348, 1, 1024] - - [628, 9279.15] + - [806, 9279.15] - - [78, 78, 816, 64] - - [668, 3591.09] + - [846, 3591.09] - - [4096, 3968, 1, 1024] - - [629, 9642.19] + - [807, 9642.19] - - [4096, 3906, 1, 1024] - - [629, 9485.37] + - [807, 9485.37] - - [1024, 3477, 1, 4096] - - [636, 8389.2] + - [814, 8389.2] - - [1024, 3397, 1, 4096] - - [646, 8556.88] + - [824, 8556.88] - - [4096, 3165, 1, 1024] - - [627, 9415.52] + - [805, 9415.52] - - [4096, 3470, 1, 1024] - - [626, 9598.5] + - [804, 9598.5] - - [1024, 3526, 1, 4096] - - [646, 8442.15] + - [824, 8442.15] - - [112, 112, 576, 64] - - [661, 5672.6] + - [839, 5672.6] - - [4096, 3365, 1, 1024] - - [626, 9321.83] + - [804, 9321.83] - - [4096, 3319, 1, 1024] - - [626, 9838.48] + - [804, 9838.48] - - [1024, 3401, 1, 4096] - - [648, 8460.86] + - [826, 8460.86] - - [1024, 3294, 1, 4096] - - [647, 8324.63] + - [825, 8324.63] - - [159, 159, 400, 64] - - [663, 5488.51] + - [841, 5488.51] - - [1024, 3472, 1, 4096] - - [641, 8289.77] + - [819, 8289.77] - - [4096, 3328, 1, 1024] - - [627, 9904.35] + - [805, 9904.35] - - [1024, 3861, 1, 1024] - - [632, 8917.63] + - [810, 8917.63] - - [1024, 3910, 1, 1024] - - [630, 9010.16] + - [808, 9010.16] - - [1024, 3410, 1, 4096] - - [648, 8519.63] + - [826, 8519.63] - - [1024, 3395, 1, 4096] - - [646, 8424.35] + - [824, 8424.35] - - [4096, 3282, 1, 1024] - - [626, 9743.67] + - [804, 9743.67] - - [1024, 3751, 1, 1024] - - [633, 8680.39] + - [811, 8680.39] - - [4096, 3145, 1, 1024] - - [628, 9353.37] + - [806, 9353.37] - - [4096, 3514, 1, 1024] - - [628, 9713.04] + - [806, 9713.04] - - [4096, 3944, 1, 1024] - - [628, 9563.92] + - [806, 9563.92] - - [1024, 3515, 1, 4096] - - [647, 8428.13] + - [825, 8428.13] - - [4096, 3409, 1, 1024] - - [627, 9428.77] + - [805, 9428.77] - - [4096, 3564, 1, 1024] - - [626, 9823.79] + - [804, 9823.79] - - [4096, 3299, 1, 1024] - - [628, 9793.03] + - [806, 9793.03] - - [1024, 3057, 1, 4096] - - [624, 9237.85] + - [802, 9237.85] - - [4096, 3531, 1, 1024] - - [626, 9745.64] + - [804, 9745.64] - - [4096, 3388, 1, 1024] - - [628, 9374.65] + - [806, 9374.65] - - [1024, 3189, 1, 4096] - - [648, 8084.6] + - [826, 8084.6] - - [1024, 3300, 1, 4096] - - [648, 8185.13] + - [826, 8185.13] - - [1024, 3720, 1, 4096] - - [627, 8755.11] + - [805, 8755.11] - - [1024, 3383, 1, 4096] - - [641, 8463.47] + - [819, 8463.47] - - [1024, 3494, 1, 4096] - - [648, 8676.57] + - [826, 8676.57] - - [77, 78, 816, 64] - - [664, 3548.26] + - [842, 3548.26] - - [1024, 3448, 1, 4096] - - [646, 8665.78] + - [824, 8665.78] - - [4096, 3542, 1, 1024] - - [626, 9771.88] + - [804, 9771.88] - - [1024, 3488, 1, 4096] - - [646, 8488.39] + - [824, 8488.39] - - [4096, 3405, 1, 1024] - - [628, 9426.16] + - [806, 9426.16] - - [1024, 3262, 1, 4096] - - [648, 8206.97] + - [826, 8206.97] - - [33708, 4005, 1, 1024] - - [629, 9928.16] + - [807, 9928.16] - - [1024, 3594, 1, 4096] - - [633, 8458.57] + - [811, 8458.57] - - [4096, 3103, 1, 1024] - - [629, 9243.14] + - [807, 9243.14] - - [4096, 3136, 1, 1024] - - [628, 9340.9] + - [806, 9340.9] - - [1024, 3378, 1, 4096] - - [649, 8432.45] + - [827, 8432.45] - - [10, 10, 5952, 64] - - [669, 523.353] + - [847, 523.353] - - [7, 7, 8192, 64] - - [669, 260.543] + - [847, 260.543] - - [4096, 3559, 1, 1024] - - [628, 9813.1] + - [806, 9813.1] - - [4096, 3368, 1, 1024] - - [629, 9328.66] + - [807, 9328.66] - - [4096, 3209, 1, 1024] - - [626, 9538.83] + - [804, 9538.83] - - [4096, 3322, 1, 1024] - - [628, 9839.58] + - [806, 9839.58] - - [1024, 3483, 1, 4096] - - [634, 8348.35] + - [812, 8348.35] - - [4096, 3473, 1, 1024] - - [627, 9605.79] + - [805, 9605.79] - - [4096, 3522, 1, 1024] - - [629, 9730.02] + - [807, 9730.02] - - [1024, 3532, 1, 4096] - - [647, 8474.32] + - [825, 8474.32] - - [4096, 3449, 1, 1024] - - [628, 9528.35] + - [806, 9528.35] - - [1024, 3351, 1, 4096] - - [649, 8311.23] + - [827, 8311.23] - - [1024, 3462, 1, 4096] - - [646, 8297.64] + - [824, 8297.64] - - [4096, 3396, 1, 1024] - - [628, 9400.25] + - [806, 9400.25] - - [132, 132, 480, 64] - - [674, 4089.84] + - [852, 4089.84] - - [111, 112, 576, 64] - - [660, 5529.7] + - [838, 5529.7] - - [1024, 3416, 1, 4096] - - [647, 8556.64] + - [825, 8556.64] - - [4096, 3469, 1, 1024] - - [629, 9598.77] + - [807, 9598.77] - - [1024, 3582, 1, 4096] - - [630, 8461.47] + - [808, 8461.47] - - [1024, 3230, 1, 4096] - - [647, 8188.94] + - [825, 8188.94] - - [1024, 3489, 1, 4096] - - [648, 8457.85] + - [826, 8457.85] - - [1024, 3427, 1, 4096] - - [648, 8566.59] + - [826, 8566.59] - - [1024, 3346, 1, 4096] - - [647, 8352.17] + - [825, 8352.17] - - [33708, 3977, 1, 1024] - - [629, 9868.5] + - [807, 9868.5] - - [4096, 3796, 1, 1024] - - [628, 9797.76] + - [806, 9797.76] - - [4096, 3176, 1, 1024] - - [628, 9435.39] + - [806, 9435.39] - - [4096, 3990, 1, 1024] - - [626, 9672.33] + - [804, 9672.33] - - [1024, 3257, 1, 4096] - - [649, 8225.17] + - [827, 8225.17] - - [4096, 3343, 1, 1024] - - [650, 9273.62] + - [828, 9273.62] - - [4096, 3440, 1, 1024] - - [626, 9501.48] + - [804, 9501.48] - - [33708, 4030, 1, 1024] - - [627, 9983.36] + - [805, 9983.36] - - [1024, 3190, 1, 4096] - - [648, 8192.11] + - [826, 8192.11] - - [1024, 3389, 1, 4096] - - [649, 8439.42] + - [827, 8439.42] - - [1024, 3500, 1, 4096] - - [647, 8556.12] + - [825, 8556.12] - - [1024, 3471, 1, 4096] - - [636, 8491.17] + - [814, 8491.17] - - [1024, 3438, 1, 4096] - - [649, 8567.95] + - [827, 8567.95] - - [4096, 3513, 1, 1024] - - [626, 9710.27] + - [804, 9710.27] - - [1024, 3562, 1, 4096] - - [641, 8608.94] + - [819, 8608.94] - - [4096, 3616, 1, 1024] - - [628, 9357.59] + - [806, 9357.59] - - [4096, 3955, 1, 1024] - - [627, 9589.71] + - [805, 9589.71] - - [1024, 3441, 1, 4096] - - [637, 8359.27] + - [815, 8359.27] - - [1024, 3236, 1, 4096] - - [651, 8022.6] + - [829, 8022.6] - - [1024, 3524, 1, 4096] - - [646, 8477.24] + - [824, 8477.24] - - [4096, 3460, 1, 1024] - - [626, 9581.96] + - [804, 9581.96] - - [16, 16, 3840, 64] - - [658, 1270.59] + - [836, 1270.59] - - [92, 93, 688, 64] - - [662, 4962.4] + - [840, 4962.4] - - [1024, 3384, 1, 4096] - - [637, 8409.39] + - [815, 8409.39] - - [4096, 3387, 1, 1024] - - [628, 9379.8] + - [806, 9379.8] - - [4096, 3436, 1, 1024] - - [626, 9491.93] + - [804, 9491.93] - - [4096, 3277, 1, 1024] - - [626, 9717.27] + - [804, 9717.27] - - [1024, 3457, 1, 4096] - - [646, 8279.22] + - [824, 8279.22] - - [1024, 3999, 1, 4096] - - [621, 9231.47] + - [799, 9231.47] - - [1024, 4032, 1, 4096] - - [630, 9443.62] + - [808, 9443.62] - - [4096, 3541, 1, 1024] - - [626, 9773.24] + - [804, 9773.24] - - [4096, 3334, 1, 1024] - - [626, 9242.79] + - [804, 9242.79] - - [1024, 3393, 1, 4096] - - [648, 8376.17] + - [826, 8376.17] - - [17, 17, 3632, 64] - - [670, 1425.77] + - [848, 1425.77] - - [1024, 3411, 1, 4096] - - [636, 8490.97] + - [814, 8490.97] - - [1024, 3822, 1, 1024] - - [633, 8773.44] + - [811, 8773.44] - - [1024, 3593, 1, 4096] - - [633, 8571.25] + - [811, 8571.25] - - [33708, 3822, 1, 1024] - - [627, 10056.8] + - [805, 10056.8] - - [4096, 3504, 1, 1024] - - [629, 9680.29] + - [807, 9680.29] - - [1024, 3163, 1, 4096] - - [648, 8014.43] + - [826, 8014.43] - - [1024, 3357, 1, 4096] - - [649, 8376.04] + - [827, 8376.04] - - [1024, 3906, 1, 4096] - - [630, 9108.22] + - [808, 9108.22] - - [4096, 3415, 1, 1024] - - [626, 9443.87] + - [804, 9443.87] - - [1024, 3406, 1, 4096] - - [649, 8451.64] + - [827, 8451.64] - - [4096, 3321, 1, 1024] - - [628, 9836.62] + - [806, 9836.62] - - [4096, 3584, 1, 1024] - - [629, 9915.93] + - [807, 9915.93] - - [1024, 2736, 1, 4096] - - [632, 8532.93] + - [810, 8532.93] - - [1024, 3110, 1, 4096] - - [649, 7889.29] + - [827, 7889.29] - - [33708, 3999, 1, 1024] - - [629, 9903.33] + - [807, 9903.33] - - [1024, 3093, 1, 4096] - - [647, 7919.35] + - [825, 7919.35] - - [4096, 3378, 1, 1024] - - [629, 9362.3] + - [807, 9362.3] - - [1024, 3543, 1, 4096] - - [643, 8438.16] + - [821, 8438.16] - - [33708, 3925, 1, 1024] - - [628, 10021.6] + - [806, 10021.6] - - [1024, 3352, 1, 4096] - - [649, 8333.82] + - [827, 8333.82] - - [4096, 3780, 1, 1024] - - [626, 9755.02] + - [804, 9755.02] - - [1024, 3990, 1, 4096] - - [623, 9251.02] + - [801, 9251.02] - - [4096, 3500, 1, 1024] - - [626, 9673.83] + - [804, 9673.83] - - [4096, 3996, 1, 1024] - - [627, 9694.5] + - [805, 9694.5] - - [1024, 3247, 1, 4096] - - [652, 8171.58] + - [830, 8171.58] - - [4096, 3395, 1, 1024] - - [628, 9392.04] + - [806, 9392.04] - - [1024, 3169, 1, 4096] - - [647, 7990.24] + - [825, 7990.24] - - [1024, 3088, 1, 4096] - - [647, 7890.36] + - [825, 7890.36] - - [1024, 3584, 1, 4096] - - [649, 8604.2] + - [827, 8604.2] - - [4096, 3093, 1, 1024] - - [628, 9224.88] + - [806, 9224.88] - - [1024, 3538, 1, 4096] - - [630, 8395.74] + - [808, 8395.74] - - [1024, 3996, 1, 1024] - - [631, 9208.33] + - [809, 9208.33] - - [1024, 3581, 1, 4096] - - [643, 8523.24] + - [821, 8523.24] - - [4096, 3374, 1, 1024] - - [628, 9342.81] + - [806, 9342.81] - - [33708, 3751, 1, 1024] - - [628, 9881.99] + - [806, 9881.99] - - [59, 59, 1088, 64] - - [666, 4515.54] + - [844, 4515.54] - - [4096, 3215, 1, 1024] - - [628, 9557.75] + - [806, 9557.75] - - [4096, 3312, 1, 1024] - - [626, 9834.4] + - [804, 9834.4] - - [4096, 3581, 1, 1024] - - [628, 9856.66] + - [806, 9856.66] - - [4096, 3479, 1, 1024] - - [628, 9620.35] + - [806, 9620.35] - - [4096, 3544, 1, 1024] - - [626, 9778.94] + - [804, 9778.94] - - [1024, 3870, 1, 1024] - - [631, 8935.26] + - [809, 8935.26] - - [1024, 3374, 1, 4096] - - [648, 8412.85] + - [826, 8412.85] - - [1024, 2967, 1, 4096] - - [631, 8982.97] + - [809, 8982.97] - - [41, 41, 1552, 64] - - [660, 2805.38] + - [838, 2805.38] - - [4096, 3455, 1, 1024] - - [626, 9538.89] + - [804, 9538.89] - - [4096, 3942, 1, 1024] - - [627, 9554.65] + - [805, 9554.65] - - [1024, 3528, 1, 4096] - - [646, 8438.47] + - [824, 8438.47] - - [4096, 3186, 1, 1024] - - [627, 9468.32] + - [805, 9468.32] - - [1024, 3976, 1, 1024] - - [631, 9167.08] + - [809, 9167.08] - - [1024, 3511, 1, 4096] - - [633, 8335.06] + - [811, 8335.06] - - [4096, 3573, 1, 1024] - - [626, 9855.33] + - [804, 9855.33] - - [4096, 3561, 1, 1024] - - [626, 9831.03] + - [804, 9831.03] - - [4096, 3418, 1, 1024] - - [627, 9450.68] + - [805, 9450.68] - - [33708, 3906, 1, 1024] - - [629, 9973.67] + - [807, 9973.67] - - [4096, 3259, 1, 1024] - - [626, 9685.26] + - [804, 9685.26] - - [4096, 3308, 1, 1024] - - [628, 9792.03] + - [806, 9792.03] - - [1024, 3419, 1, 4096] - - [648, 8514.53] + - [826, 8514.53] - - [1024, 3215, 1, 4096] - - [647, 8137.53] + - [825, 8137.53] - - [1024, 4030, 1, 4096] - - [629, 9290.76] + - [807, 9290.76] - - [4096, 3459, 1, 1024] - - [626, 9567.57] + - [804, 9567.57] - - [1024, 3572, 1, 4096] - - [646, 8501.43] + - [824, 8501.43] - - [1024, 3137, 1, 4096] - - [648, 7930.15] + - [826, 7930.15] - - [1024, 3312, 1, 4096] - - [649, 8378.6] + - [827, 8378.6] - - [1024, 3925, 1, 4096] - - [631, 9255.86] + - [809, 9255.86] - - [1024, 3453, 1, 4096] - - [648, 8630.76] + - [826, 8630.76] - - [4096, 3435, 1, 1024] - - [627, 9495.18] + - [805, 9495.18] - - [1024, 3176, 1, 4096] - - [648, 8087.23] + - [826, 8087.23] - - [1024, 3444, 1, 4096] - - [640, 8528.58] + - [818, 8528.58] - - [4096, 3975, 1, 1024] - - [629, 9645.34] + - [807, 9645.34] - - [4096, 3182, 1, 1024] - - [628, 9448.4] + - [806, 9448.4] - - [1024, 3475, 1, 4096] - - [647, 8404.87] + - [825, 8404.87] - - [9, 9, 6544, 64] - - [662, 425.854] + - [840, 425.854] - - [33708, 3955, 1, 1024] - - [629, 10088.4] + - [807, 10088.4] - - [4096, 3446, 1, 1024] - - [628, 9520.06] + - [806, 9520.06] - - [1024, 3138, 1, 4096] - - [647, 8053.44] + - [825, 8053.44] - - [1024, 3549, 1, 4096] - - [633, 8426.42] + - [811, 8426.42] - - [4096, 3287, 1, 1024] - - [629, 9751.34] + - [807, 9751.34] - - [1024, 3342, 1, 4096] - - [646, 8320.01] + - [824, 8320.01] - - [102, 102, 624, 64] - - [661, 4747.52] + - [839, 4747.52] - - [4096, 3519, 1, 1024] - - [628, 9716.1] + - [806, 9716.1] - - [4096, 3552, 1, 1024] - - [626, 9806.69] + - [804, 9806.69] - - [4096, 3859, 1, 1024] - - [626, 9369.94] + - [804, 9369.94] - - [33708, 3969, 1, 1024] - - [626, 9830.39] + - [804, 9830.39] - - [1024, 3369, 1, 4096] - - [647, 8379.26] + - [825, 8379.26] - - [4096, 3482, 1, 1024] - - [626, 9631.7] + - [804, 9631.7] - - [1024, 3306, 1, 4096] - - [649, 8320.06] + - [827, 8320.06] - - [1024, 3474, 1, 4096] - - [648, 8498.9] + - [826, 8498.9] - - [99, 99, 624, 64] - - [660, 4492.9] + - [838, 4492.9] - - [4096, 3377, 1, 1024] - - [626, 9369.92] + - [804, 9369.92] - - [4096, 3426, 1, 1024] - - [626, 9467.3] + - [804, 9467.3] - - [4096, 2935, 1, 1024] - - [627, 9423.74] + - [805, 9423.74] - - [4096, 3267, 1, 1024] - - [626, 9698.04] + - [804, 9698.04] - - [1024, 3299, 1, 4096] - - [647, 8264.76] + - [825, 8264.76] - - [1024, 3456, 1, 4096] - - [646, 8678.39] + - [824, 8678.39] - - [1024, 3280, 1, 4096] - - [647, 8220.69] + - [825, 8220.69] - - [1024, 3555, 1, 4096] - - [646, 8656.27] + - [824, 8656.27] - - [4096, 3499, 1, 1024] - - [628, 9663.93] + - [806, 9663.93] - - [4096, 3356, 1, 1024] - - [628, 9296.9] + - [806, 9296.9] - - [100, 102, 624, 64] - - [661, 4671.51] + - [839, 4671.51] - - [1024, 3412, 1, 4096] - - [649, 8538.05] + - [827, 8538.05] - - [1024, 2984, 1, 4096] - - [632, 9193.17] + - [810, 9193.17] - - [4096, 3141, 1, 1024] - - [628, 9349.43] + - [806, 9349.43] - - [4096, 3510, 1, 1024] - - [626, 9701.98] + - [804, 9701.98] - - [1024, 3995, 1, 1024] - - [630, 9243.4] + - [808, 9243.4] - - [1024, 3517, 1, 4096] - - [648, 8569.31] + - [826, 8569.31] - - [1024, 3455, 1, 4096] - - [648, 8560.67] + - [826, 8560.67] - - [1024, 3939, 1, 1024] - - [631, 9030.94] + - [809, 9030.94] - - [38, 38, 1680, 64] - - [660, 2459.84] + - [838, 2459.84] - - [1024, 3447, 1, 4096] - - [646, 8610.02] + - [824, 8610.02] - - [1024, 3969, 1, 4096] - - [633, 9097.33] + - [811, 9097.33] - - [4096, 3527, 1, 1024] - - [628, 9743.83] + - [806, 9743.83] - - [4096, 3336, 1, 1024] - - [628, 9248.33] + - [806, 9248.33] - - [1024, 3191, 1, 4096] - - [646, 8104.96] + - [824, 8104.96] - - [1024, 3302, 1, 4096] - - [647, 8245.09] + - [825, 8245.09] - - [1024, 3337, 1, 4096] - - [649, 8254.25] + - [827, 8254.25] - - [4096, 3290, 1, 1024] - - [628, 9759.13] + - [806, 9759.13] - - [1024, 3512, 1, 4096] - - [637, 8641.06] + - [815, 8641.06] - - [1024, 3433, 1, 4096] - - [647, 8444.7] + - [825, 8444.7] - - [4096, 3876, 1, 1024] - - [627, 9420.38] + - [805, 9420.38] - - [4096, 3490, 1, 1024] - - [628, 9641.11] + - [806, 9641.11] - - [4096, 3064, 1, 1024] - - [628, 9820.49] + - [806, 9820.49] - - [1024, 3508, 1, 4096] - - [643, 8442.24] + - [821, 8442.24] - - [1024, 3956, 1, 4096] - - [628, 9128.19] + - [806, 9128.19] - - [4096, 3417, 1, 1024] - - [628, 9448.41] + - [806, 9448.41] - - [1024, 3248, 1, 4096] - - [647, 8006.16] + - [825, 8006.16] - - [1024, 2499, 1, 4096] - - [647, 8155.19] + - [825, 8155.19] - - [1024, 3186, 1, 4096] - - [647, 8093.04] + - [825, 8093.04] - - [1024, 3180, 1, 4096] - - [649, 8097.02] + - [827, 8097.02] - - [4096, 3364, 1, 1024] - - [628, 9318.08] + - [806, 9318.08] - - [4096, 3976, 1, 1024] - - [628, 9654.47] + - [806, 9654.47] - - [4096, 3205, 1, 1024] - - [629, 9538.84] + - [807, 9538.84] - - [4096, 3318, 1, 1024] - - [626, 9838.29] + - [804, 9838.29] - - [1024, 3377, 1, 4096] - - [649, 8445.64] + - [827, 8445.64] - - [1024, 3485, 1, 4096] - - [646, 8368.83] + - [824, 8368.83] - - [4096, 3181, 1, 1024] - - [629, 9458.29] + - [807, 9458.29] - - [4096, 3550, 1, 1024] - - [626, 9783.14] + - [804, 9783.14] - - [1024, 3534, 1, 4096] - - [635, 8684.99] + - [813, 8684.99] - - [1024, 3860, 1, 1024] - - [630, 8923.18] + - [808, 8923.18] - - [160, 160, 400, 64] - - [673, 5797.69] + - [851, 5797.69] - - [4096, 3445, 1, 1024] - - [628, 9511.28] + - [806, 9511.28] - - [1024, 3391, 1, 4096] - - [649, 8541.77] + - [827, 8541.77] - - [1024, 3221, 1, 4096] - - [647, 8055.5] + - [825, 8055.5] - - [4096, 3079, 1, 1024] - - [626, 9181.04] + - [804, 9181.04] - - [4096, 3144, 1, 1024] - - [628, 9351.45] + - [806, 9351.45] - - [1024, 3270, 1, 4096] - - [648, 8367.63] + - [826, 8367.63] - - [1024, 3561, 1, 4096] - - [648, 8426.29] + - [826, 8426.29] - - [1024, 3480, 1, 4096] - - [635, 8465.0] + - [813, 8465.0] - - [4096, 3408, 1, 1024] - - [628, 9420.04] + - [806, 9420.04] - - [1024, 3418, 1, 4096] - - [649, 8481.02] + - [827, 8481.02] - - [4096, 3298, 1, 1024] - - [629, 9788.4] + - [807, 9788.4] - - [1024, 3640, 1, 1024] - - [632, 8435.44] + - [810, 8435.44] - - [1024, 3449, 1, 4096] - - [647, 8590.87] + - [825, 8590.87] - - [1024, 4020, 1, 4096] - - [625, 9168.13] + - [803, 9168.13] - - [4096, 3481, 1, 1024] - - [626, 9627.91] + - [804, 9627.91] - - [4096, 3530, 1, 1024] - - [628, 9734.68] + - [806, 9734.68] - - [1024, 3216, 1, 4096] - - [649, 8014.32] + - [827, 8014.32] - - [1024, 3840, 1, 1024] - - [632, 8908.37] + - [810, 8908.37] - - [1024, 3491, 1, 4096] - - [635, 8410.59] + - [813, 8410.59] - - [1024, 3154, 1, 4096] - - [648, 8095.69] + - [826, 8095.69] - - [4096, 3425, 1, 1024] - - [628, 9474.53] + - [806, 9474.53] - - [1024, 3348, 1, 4096] - - [646, 8202.9] + - [824, 8202.9] - - [1024, 3415, 1, 4096] - - [647, 8597.68] + - [825, 8597.68] - - [1024, 4026, 1, 1024] - - [630, 9279.09] + - [808, 9279.09] - - [1024, 3367, 1, 4096] - - [649, 8335.54] + - [827, 8335.54] - - [1024, 3259, 1, 4096] - - [649, 8285.3] + - [827, 8285.3] - - [1024, 3894, 1, 4096] - - [632, 9040.44] + - [810, 9040.44] - - [4096, 3355, 1, 1024] - - [627, 9291.67] + - [805, 9291.67] - - [4096, 3404, 1, 1024] - - [628, 9410.47] + - [806, 9410.47] - - [1024, 3308, 1, 4096] - - [649, 8336.3] + - [827, 8336.3] - - [4096, 3245, 1, 1024] - - [627, 9641.47] + - [805, 9641.47] - - [1024, 3502, 1, 4096] - - [648, 8375.9] + - [826, 8375.9] - - [33708, 4032, 1, 1024] - - [627, 9988.2] + - [805, 9988.2] - - [8, 8, 7280, 64] - - [664, 339.878] + - [842, 339.878] - - [1024, 3424, 1, 4096] - - [635, 8489.48] + - [813, 8489.48] - - [4096, 3509, 1, 1024] - - [627, 9702.29] + - [805, 9702.29] - - [4096, 3558, 1, 1024] - - [628, 9815.51] + - [806, 9815.51] - - [1024, 3900, 1, 1024] - - [631, 9014.05] + - [809, 9014.05] - - [1024, 2505, 1, 4096] - - [645, 8263.75] + - [823, 8263.75] - - [4096, 3472, 1, 1024] - - [626, 9609.61] + - [804, 9609.61] - - [1024, 3386, 1, 4096] - - [646, 8417.55] + - [824, 8417.55] - - [4096, 3383, 1, 1024] - - [628, 9364.77] + - [806, 9364.77] - - [4096, 3448, 1, 1024] - - [629, 9521.07] + - [807, 9521.07] - - [4096, 4030, 1, 1024] - - [629, 9771.56] + - [807, 9771.56] - - [4096, 3289, 1, 1024] - - [626, 9757.27] + - [804, 9757.27] - - [1024, 3459, 1, 4096] - - [648, 8422.12] + - [826, 8422.12] - - [1024, 2918, 1, 4096] - - [633, 9022.71] + - [811, 9022.71] - - [4096, 3489, 1, 1024] - - [626, 9641.9] + - [804, 9641.9] - - [4096, 3346, 1, 1024] - - [628, 9271.65] + - [806, 9271.65] - - [4096, 3572, 1, 1024] - - [628, 9829.82] + - [806, 9829.82] - - [1024, 3955, 1, 4096] - - [629, 9221.66] + - [807, 9221.66] - - [4096, 3236, 1, 1024] - - [626, 9620.72] + - [804, 9620.72] - - [4096, 3163, 1, 1024] - - [626, 9397.3] + - [804, 9397.3] - - [4096, 3468, 1, 1024] - - [626, 9601.58] + - [804, 9601.58] - - [1024, 3165, 1, 4096] - - [648, 7941.58] + - [826, 7941.58] - - [1024, 3276, 1, 4096] - - [648, 8244.96] + - [826, 8244.96] - - [1024, 3359, 1, 4096] - - [646, 8273.93] + - [824, 8273.93] - - [4096, 3363, 1, 1024] - - [628, 9315.8] + - [806, 9315.8] - - [1024, 3385, 1, 4096] - - [640, 8286.2] + - [818, 8286.2] - - [1024, 3207, 1, 4096] - - [649, 8144.02] + - [827, 8144.02] - - [1024, 3458, 1, 4096] - - [648, 8472.41] + - [826, 8472.41] - - [21, 21, 2976, 64] - - [664, 2083.3] + - [842, 2083.3] - - [4096, 3110, 1, 1024] - - [626, 9260.3] + - [804, 9260.3] - - [4096, 3925, 1, 1024] - - [629, 9526.66] + - [807, 9526.66] - - [1024, 3975, 1, 4096] - - [624, 9133.84] + - [802, 9133.84] - - [4096, 3549, 1, 1024] - - [628, 9793.77] + - [806, 9793.77] - - [4096, 3342, 1, 1024] - - [627, 9264.48] + - [805, 9264.48] - - [1024, 3859, 1, 1024] - - [630, 8933.47] + - [808, 8933.47] - - [1024, 3497, 1, 4096] - - [647, 8526.13] + - [825, 8526.13] - - [4096, 3280, 1, 1024] - - [628, 9733.32] + - [806, 9733.32] - - [1024, 3435, 1, 4096] - - [647, 8489.85] + - [825, 8489.85] - - [1024, 3354, 1, 4096] - - [647, 8248.83] + - [825, 8248.83] - - [4096, 3191, 1, 1024] - - [627, 9475.12] + - [805, 9475.12] - - [4096, 3512, 1, 1024] - - [626, 9701.37] + - [804, 9701.37] - - [1024, 3055, 1, 4096] - - [633, 9264.91] + - [811, 9264.91] - - [4096, 2499, 1, 1024] - - [628, 9574.06] + - [806, 9574.06] - - [1024, 3233, 1, 4096] - - [646, 8101.74] + - [824, 8101.74] - - [4096, 3423, 1, 1024] - - [629, 9463.5] + - [807, 9463.5] - - [1024, 3319, 1, 4096] - - [649, 8413.76] + - [827, 8413.76] - - [4096, 3297, 1, 1024] - - [626, 9782.66] + - [804, 9782.66] - - [4096, 3154, 1, 1024] - - [628, 9381.2] + - [806, 9381.2] - - [1024, 3540, 1, 4096] - - [649, 8507.53] + - [827, 8507.53] - - [1024, 3289, 1, 4096] - - [649, 8233.8] + - [827, 8233.8] - - [4096, 3529, 1, 1024] - - [628, 9741.15] + - [806, 9741.15] - - [4096, 3386, 1, 1024] - - [628, 9372.57] + - [806, 9372.57] - - [4096, 3276, 1, 1024] - - [626, 9713.76] + - [804, 9713.76] - - [1024, 3244, 1, 4096] - - [649, 8146.83] + - [827, 8146.83] - - [1024, 3182, 1, 4096] - - [646, 8115.12] + - [824, 8115.12] - - [4096, 3540, 1, 1024] - - [626, 9768.42] + - [804, 9768.42] - - [1024, 3360, 1, 4096] - - [648, 8353.31] + - [826, 8353.31] - - [1024, 3942, 1, 4096] - - [627, 9143.78] + - [805, 9143.78] - - [4096, 3403, 1, 1024] - - [629, 9412.18] + - [807, 9412.18] - - [4096, 3101, 1, 1024] - - [629, 9239.28] + - [807, 9239.28] - - [4096, 2918, 1, 1024] - - [628, 9373.75] + - [806, 9373.75] - - [1024, 3465, 1, 4096] - - [649, 8288.16] + - [827, 8288.16] - - [33708, 3780, 1, 1024] - - [628, 9971.91] + - [806, 9971.91] - - [4096, 3557, 1, 1024] - - [626, 9814.82] + - [804, 9814.82] - - [4096, 3414, 1, 1024] - - [626, 9436.63] + - [804, 9436.63] - - [1024, 3948, 1, 1024] - - [630, 9073.8] + - [808, 9073.8] - - [4096, 3320, 1, 1024] - - [628, 9834.77] + - [806, 9834.77] - - [4096, 2765, 1, 1024] - - [628, 9667.06] + - [806, 9667.06] - - [1024, 3978, 1, 4096] - - [623, 9109.6] + - [801, 9109.6] - - [4096, 3487, 1, 1024] - - [626, 9644.0] + - [804, 9644.0] - - [4096, 3520, 1, 1024] - - [628, 9728.08] + - [806, 9728.08] - - [1024, 3139, 1, 4096] - - [648, 7940.19] + - [826, 7940.19] - - [1024, 3314, 1, 4096] - - [646, 8294.01] + - [824, 8294.01] - - [4096, 3431, 1, 1024] - - [628, 9482.12] + - [806, 9482.12] - - [123, 122, 528, 64] - - [661, 6325.98] + - [839, 6325.98] - - [1024, 3446, 1, 4096] - - [642, 8468.34] + - [820, 8468.34] - - [1024, 4059, 1, 4096] - - [629, 9370.8] + - [807, 9370.8] - - [99, 102, 624, 64] - - [661, 4624.8] + - [839, 4624.8] - - [4096, 3345, 1, 1024] - - [626, 9271.32] + - [804, 9271.32] - - [4096, 3394, 1, 1024] - - [626, 9398.19] + - [804, 9398.19] - - [1024, 3927, 1, 1024] - - [631, 9041.38] + - [809, 9041.38] - - [4096, 3235, 1, 1024] - - [626, 9619.93] + - [804, 9619.93] - - [1024, 3328, 1, 4096] - - [647, 8406.09] + - [825, 8406.09] - - [33708, 3956, 1, 1024] - - [627, 10100.4] + - [805, 10100.4] - - [4096, 3467, 1, 1024] - - [628, 9586.66] + - [806, 9586.66] - - [1024, 3287, 1, 4096] - - [648, 8273.83] + - [826, 8273.83] - - [4096, 3214, 1, 1024] - - [629, 9557.49] + - [807, 9557.49] - - [4096, 3910, 1, 1024] - - [626, 9490.25] + - [804, 9490.25] - - [1024, 3780, 1, 1024] - - [633, 8706.0] + - [811, 8706.0] - - [1024, 3371, 1, 4096] - - [649, 8248.46] + - [827, 8248.46] - - [4096, 3478, 1, 1024] - - [629, 9619.62] + - [807, 9619.62] - - [1024, 3546, 1, 4096] - - [647, 8456.83] + - [825, 8456.83] - - [1024, 4012, 1, 1024] - - [630, 9253.34] + - [808, 9253.34] - - [4096, 3341, 1, 1024] - - [628, 9260.24] + - [806, 9260.24] - - [4096, 3454, 1, 1024] - - [626, 9533.62] + - [804, 9533.62] - - [4096, 3295, 1, 1024] - - [629, 9772.86] + - [807, 9772.86] - - [4096, 3072, 1, 1024] - - [626, 9887.23] + - [804, 9887.23] - - [1024, 3282, 1, 4096] - - [634, 8112.85] + - [812, 8112.85] - - [33708, 3720, 1, 1024] - - [629, 9818.85] + - [807, 9818.85] - - [1024, 3681, 1, 4096] - - [631, 8639.28] + - [809, 8639.28] - - [1024, 4050, 1, 4096] - - [629, 9291.93] + - [807, 9291.93] - - [4096, 3495, 1, 1024] - - [628, 9660.52] + - [806, 9660.52] - - [4096, 3560, 1, 1024] - - [627, 9813.8] + - [805, 9813.8] - - [4096, 3751, 1, 1024] - - [626, 9684.95] + - [804, 9684.95] - - [1024, 3414, 1, 4096] - - [647, 8555.72] + - [825, 8555.72] - - [33708, 3860, 1, 1024] - - [626, 9856.68] + - [804, 9856.68] - - [1024, 3325, 1, 4096] - - [636, 8261.21] + - [814, 8261.21] - - [4096, 3458, 1, 1024] - - [626, 9570.86] + - [804, 9570.86] - - [4096, 2967, 1, 1024] - - [626, 9544.61] + - [804, 9544.61] - - [1024, 3519, 1, 4096] - - [649, 8413.1] + - [827, 8413.1] - - [4096, 3385, 1, 1024] - - [628, 9367.34] + - [806, 9367.34] - - [4096, 3434, 1, 1024] - - [626, 9488.41] + - [804, 9488.41] - - [1024, 3552, 1, 4096] - - [647, 8456.13] + - [825, 8456.13] - - [4096, 3822, 1, 1024] - - [627, 9849.84] + - [805, 9849.84] - - [1024, 3544, 1, 4096] - - [646, 8494.56] + - [824, 8494.56] - - [4096, 3539, 1, 1024] - - [628, 9763.09] + - [806, 9763.09] - - [4096, 3332, 1, 1024] - - [626, 9232.36] + - [804, 9232.36] - - [1024, 3145, 1, 4096] - - [646, 8098.36] + - [824, 8098.36] - - [1024, 3535, 1, 4096] - - [634, 8592.8] + - [812, 8592.8] - - [1024, 3320, 1, 4096] - - [647, 8419.55] + - [825, 8419.55] - - [33708, 4012, 1, 1024] - - [629, 9940.2] + - [807, 9940.2] - - [4096, 3286, 1, 1024] - - [628, 9747.82] + - [806, 9747.82] - - [1024, 3514, 1, 4096] - - [647, 8653.69] + - [825, 8653.69] - - [93, 93, 688, 64] - - [668, 5005.79] + - [846, 5005.79] - - [1024, 2765, 1, 4096] - - [633, 8636.72] + - [811, 8636.72] - - [1024, 3452, 1, 4096] - - [646, 8445.87] + - [824, 8445.87] - - [4096, 3518, 1, 1024] - - [626, 9722.56] + - [804, 9722.56] - - [1024, 3529, 1, 4096] - - [646, 8444.32] + - [824, 8444.32] - - [4096, 3413, 1, 1024] - - [626, 9436.35] + - [804, 9436.35] - - [33708, 4050, 1, 1024] - - [628, 10026.7] + - [806, 10026.7] - - [1024, 3525, 1, 4096] - - [639, 8488.99] + - [817, 8488.99] - - [4096, 3303, 1, 1024] - - [626, 9791.05] + - [804, 9791.05] - - [1024, 3382, 1, 4096] - - [647, 8483.63] + - [825, 8483.63] - - [1024, 3390, 1, 4096] - - [646, 8552.81] + - [824, 8552.81] - - [1024, 3977, 1, 4096] - - [628, 9053.53] + - [806, 9053.53] - - [1024, 3184, 1, 4096] - - [646, 8008.81] + - [824, 8008.81] - - [4096, 3535, 1, 1024] - - [628, 9760.79] + - [806, 9760.79] - - [4096, 3376, 1, 1024] - - [629, 9341.93] + - [807, 9341.93] - - [4096, 3978, 1, 1024] - - [629, 9642.8] + - [807, 9642.8] - - [1024, 3136, 1, 4096] - - [648, 8085.12] + - [826, 8085.12] - - [1024, 3293, 1, 4096] - - [646, 8300.49] + - [824, 8300.49] - - [4096, 3266, 1, 1024] - - [627, 9691.78] + - [805, 9691.78] - - [1024, 3487, 1, 4096] - - [646, 8383.62] + - [824, 8383.62] - - [1024, 3409, 1, 4096] - - [648, 8493.25] + - [826, 8493.25] - - [4096, 3498, 1, 1024] - - [627, 9672.38] + - [805, 9672.38] - - [1024, 3520, 1, 4096] - - [649, 8488.26] + - [827, 8488.26] - - [1024, 3530, 1, 4096] - - [630, 8409.87] + - [808, 8409.87] - - [4096, 3393, 1, 1024] - - [628, 9395.43] + - [806, 9395.43] - - [4096, 3140, 1, 1024] - - [628, 9338.5] + - [806, 9338.5] - - [1024, 3536, 1, 4096] - - [649, 8642.11] + - [827, 8642.11] - - [1024, 3288, 1, 4096] - - [649, 8229.34] + - [827, 8229.34] - - [1024, 4005, 1, 4096] - - [631, 9271.04] + - [809, 9271.04] - - [1024, 3579, 1, 4096] - - [635, 8844.5] + - [813, 8844.5] - - [4096, 3372, 1, 1024] - - [626, 9339.25] + - [804, 9339.25] - - [1024, 3440, 1, 4096] - - [646, 8466.69] + - [824, 8466.69] - - [4096, 3213, 1, 1024] - - [629, 9558.85] + - [807, 9558.85] - - [123, 123, 528, 64] - - [661, 6333.59] + - [839, 6333.59] - - [100, 100, 624, 64] - - [660, 4584.12] + - [838, 4584.12] - - [1024, 3968, 1, 4096] - - [627, 9237.6] + - [805, 9237.6] - - [4096, 3477, 1, 1024] - - [627, 9618.88] + - [805, 9618.88] - - [4096, 3526, 1, 1024] - - [626, 9735.94] + - [804, 9735.94] - - [1024, 3493, 1, 4096] - - [647, 8355.13] + - [825, 8355.13] - - [1024, 3944, 1, 4096] - - [622, 9065.39] + - [800, 9065.39] - - [4096, 3453, 1, 1024] - - [627, 9533.37] + - [805, 9533.37] - - [1024, 3350, 1, 4096] - - [649, 8448.64] + - [827, 8448.64] - - [4096, 3184, 1, 1024] - - [628, 9447.38] + - [806, 9447.38] - - [1024, 3423, 1, 4096] - - [647, 8465.38] + - [825, 8465.38] - - [4096, 3351, 1, 1024] - - [626, 9282.06] + - [804, 9282.06] - - [4096, 3416, 1, 1024] - - [626, 9446.64] + - [804, 9446.64] - - [1024, 3796, 1, 4096] - - [628, 8820.34] + - [806, 8820.34] - - [4096, 3257, 1, 1024] - - [626, 9671.64] + - [804, 9671.64] - - [4096, 3306, 1, 1024] - - [628, 9795.51] + - [806, 9795.51] - - [33708, 4020, 1, 1024] - - [628, 9961.85] + - [806, 9961.85] - - [19, 19, 3264, 64] - - [658, 1736.09] + - [836, 1736.09] - - [1024, 3426, 1, 4096] - - [646, 8518.61] + - [824, 8518.61] - - [4096, 3457, 1, 1024] - - [626, 9564.56] + - [804, 9564.56] - - [1024, 2935, 1, 4096] - - [631, 9067.79] + - [809, 9067.79] - - [1024, 3046, 1, 4096] - - [631, 9242.97] + - [809, 9242.97] - - [4096, 3433, 1, 1024] - - [628, 9495.65] + - [806, 9495.65] - - [1024, 3256, 1, 4096] - - [649, 8224.23] + - [827, 8224.23] - - [1024, 3531, 1, 4096] - - [646, 8524.19] + - [824, 8524.19] - - [4096, 3180, 1, 1024] - - [626, 9443.53] + - [804, 9443.53] - - [1024, 3388, 1, 4096] - - [648, 8352.82] + - [826, 8352.82] - - [4096, 3444, 1, 1024] - - [629, 9511.03] + - [807, 9511.03] - - [1024, 3501, 1, 4096] - - [636, 8461.12] + - [814, 8461.12] - - [1024, 3266, 1, 4096] - - [634, 8147.44] + - [812, 8147.44] - - [1024, 3267, 1, 4096] - - [649, 8391.49] + - [827, 8391.49] - - [1024, 3461, 1, 4096] - - [633, 8270.29] + - [811, 8270.29] - - [4096, 3870, 1, 1024] - - [628, 9399.69] + - [806, 9399.69] - - [4096, 3517, 1, 1024] - - [626, 9725.43] + - [804, 9725.43] - - [1024, 3566, 1, 4096] - - [649, 8669.76] + - [827, 8669.76] - - [4096, 3574, 1, 1024] - - [626, 9844.63] + - [804, 9844.63] - - [1024, 3876, 1, 1024] - - [631, 8961.74] + - [809, 8961.74] - - [25, 25, 2512, 64] - - [657, 2472.54] + - [835, 2472.54] - - [4096, 3720, 1, 1024] - - [626, 9612.49] + - [804, 9612.49] - - [4096, 3248, 1, 1024] - - [628, 9644.92] + - [806, 9644.92] - - [4096, 4059, 1, 1024] - - [626, 9826.42] + - [804, 9826.42] - - [1024, 3380, 1, 4096] - - [647, 8677.91] + - [825, 8677.91] - - [4096, 3480, 1, 1024] - - [628, 9626.16] + - [806, 9626.16] - - [1024, 3335, 1, 4096] - - [648, 8302.18] + - [826, 8302.18] - - [1024, 3345, 1, 4096] - - [648, 8323.13] + - [826, 8323.13] - - [4096, 3391, 1, 1024] - - [626, 9379.48] + - [804, 9379.48] - - [4096, 3424, 1, 1024] - - [628, 9466.77] + - [806, 9466.77] - - [1024, 3394, 1, 4096] - - [634, 8373.91] + - [812, 8373.91] - - [4096, 3265, 1, 1024] - - [628, 9700.89] + - [806, 9700.89] - - [1024, 3014, 1, 4096] - - [631, 9303.09] + - [809, 9303.09] - - [4096, 3497, 1, 1024] - - [626, 9668.6] + - [804, 9668.6] - - [4096, 3354, 1, 1024] - - [628, 9294.31] + - [806, 9294.31] - - [4096, 3055, 1, 1024] - - [627, 9780.88] + - [805, 9780.88] - - [1024, 3499, 1, 4096] - - [640, 8527.04] + - [818, 8527.04] - - [1024, 3162, 1, 4096] - - [648, 8059.02] + - [826, 8059.02] - - [4096, 3244, 1, 1024] - - [628, 9636.86] + - [806, 9636.86] - - [1024, 3437, 1, 4096] - - [647, 8583.41] + - [825, 8583.41] - - [1024, 3356, 1, 4096] - - [649, 8296.95] + - [827, 8296.95] - - [4096, 3139, 1, 1024] - - [628, 9338.7] + - [806, 9338.7] - - [4096, 3508, 1, 1024] - - [628, 9700.54] + - [806, 9700.54] - - [1024, 3235, 1, 4096] - - [646, 8314.59] + - [824, 8314.59] - - [1024, 3910, 1, 4096] - - [633, 9200.21] + - [811, 9200.21] - - [4096, 3371, 1, 1024] - - [626, 9336.97] + - [804, 9336.97] - - [1024, 3751, 1, 4096] - - [633, 8827.67] + - [811, 8827.67] - - [4096, 3325, 1, 1024] - - [626, 9845.68] + - [804, 9845.68] - - [1024, 3413, 1, 4096] - - [634, 8345.78] + - [812, 8345.78] - - [1024, 3542, 1, 4096] - - [646, 8521.71] + - [824, 8521.71] - - [18, 18, 3440, 64] - - [662, 1578.24] + - [840, 1578.24] - - [101, 102, 624, 64] - - [660, 4705.28] + - [838, 4705.28] - - [33708, 3900, 1, 1024] - - [626, 9951.05] + - [804, 9951.05] - - [4096, 3525, 1, 1024] - - [627, 9744.47] + - [805, 9744.47] - - [4096, 3382, 1, 1024] - - [627, 9359.03] + - [805, 9359.03] - - [102, 100, 624, 64] - - [661, 4671.51] + - [839, 4671.51] - - [15, 15, 4096, 64] - - [665, 1129.17] + - [843, 1129.17] - - [1024, 3339, 1, 4096] - - [635, 8326.37] + - [813, 8326.37] - - [4096, 3288, 1, 1024] - - [628, 9761.48] + - [806, 9761.48] - - [92, 92, 688, 64] - - [668, 4903.87] + - [846, 4903.87] - - [1024, 3141, 1, 4096] - - [646, 7975.64] + - [824, 7975.64] - - [1024, 3168, 1, 4096] - - [646, 8083.74] + - [824, 8083.74] - - [4096, 3488, 1, 1024] - - [628, 9646.77] + - [806, 9646.77] - - [4096, 3046, 1, 1024] - - [627, 9767.58] + - [805, 9767.58] - - [1024, 3362, 1, 4096] - - [649, 8458.15] + - [827, 8458.15] - - [33708, 3942, 1, 1024] - - [627, 10060.4] + - [805, 10060.4] - - [4096, 3399, 1, 1024] - - [628, 9406.57] + - [806, 9406.57] - - [1024, 3720, 1, 1024] - - [630, 8639.16] + - [808, 8639.16] - - [4096, 3563, 1, 1024] - - [626, 9836.55] + - [804, 9836.55] - - [1024, 3273, 1, 4096] - - [649, 8221.62] + - [827, 8221.62] - - [4096, 3162, 1, 1024] - - [628, 9400.19] + - [806, 9400.19] - - [1024, 3467, 1, 4096] - - [647, 8342.42] + - [825, 8342.42] - - [1024, 3130, 1, 4096] - - [648, 7933.88] + - [826, 7933.88] - - [1024, 3405, 1, 4096] - - [655, 8406.59] + - [833, 8406.59] - - [4096, 3362, 1, 1024] - - [626, 9312.04] + - [804, 9312.04] - - [1024, 3960, 1, 1024] - - [630, 9082.26] + - [808, 9082.26] - - [2048, 128, 1, 4096] - - [680, 5986.62] + - [858, 5986.62] - - [1024, 3712, 1, 36548] - - [678, 9456.25] + - [856, 9456.25] - - [1024, 128, 1, 1024] - - [681, 3631.53] + - [859, 3631.53] - - [3072, 128, 1, 4096] - - [677, 6145.6] + - [855, 6145.6] - - [1024, 3712, 1, 1024] - - [679, 8933.98] + - [857, 8933.98] + - - [256, 256, 192, 64] + - [862, 8264.74] + - - [768, 4096, 1, 768] + - [875, 9642.18] + - - [768, 64, 1, 768] + - [872, 1850.53] + - - [768, 1280, 1, 768] + - [875, 8738.23] + - - [30522, 320, 1, 768] + - [876, 9733.69] + - - [128, 128, 96, 64] + - [865, 5470.93] + - - [2, 16, 1, 768] + - [868, 2.57742] + - - [30522, 1280, 1, 768] + - [874, 10128.0] + - - [30522, 640, 1, 768] + - [875, 9987.71] + - - [2, 8, 1, 768] + - [867, 1.06] + - - [768, 4096, 1, 3072] + - [877, 9479.51] + - - [768, 32, 1, 768] + - [871, 880.434] + - - [2, 64, 1, 768] + - [868, 10.09024] + - - [256, 256, 96, 64] + - [862, 7614.57] + - - [64, 64, 768, 64] + - [864, 5354.53] + - - [30522, 160, 1, 768] + - [873, 7740.21] + - - [768, 320, 1, 768] + - [866, 5423.77] + - - [128, 128, 384, 64] + - [863, 7180.08] + - - [768, 16, 1, 768] + - [869, 706.476] + - - [3072, 4096, 1, 768] + - [878, 9961.84] + - - [2048, 512, 1, 100] + - [880, 5180.81] + - - [1024, 200, 1, 560] + - [881, 4061.29] + - - [256, 1280, 1, 1024] + - [888, 4337.54] + - - [256, 44505, 1, 1024] + - [924, 8597.79] + - - [10240, 8976, 1, 256] + - [927, 9471.53] + - - [256, 7168, 1, 1024] + - [918, 6718.66] + - - [8448, 8976, 1, 256] + - [910, 9601.41] + - - [18944, 8976, 1, 256] + - [919, 9666.36] + - - [256, 19200, 1, 1024] + - [895, 7489.04] + - - [5632, 8976, 1, 256] + - [907, 9358.49] + - - [256, 23552, 1, 1024] + - [922, 7980.99] + - - [256, 6656, 1, 1024] + - [922, 6287.32] + - - [256, 14336, 1, 1024] + - [917, 7049.36] + - - [256, 12544, 1, 1024] + - [895, 6728.57] + - - [2048, 684, 1, 768] + - [912, 8479.28] + - - [5376, 8976, 1, 256] + - [907, 9519.61] + - - [256, 5888, 1, 1024] + - [927, 6012.5] + - - [19968, 8976, 1, 256] + - [919, 9684.77] + - - [3840, 8976, 1, 256] + - [904, 9461.99] + - - [4608, 8976, 1, 256] + - [904, 9305.92] + - - [256, 684, 1, 1024] + - [930, 3513.16] + - - [256, 22016, 1, 1024] + - [895, 7643.89] + - - [256, 23296, 1, 1024] + - [924, 8048.22] + - - [4864, 8976, 1, 256] + - [902, 9545.72] + - - [256, 7424, 1, 1024] + - [920, 6770.75] + - - [18176, 8976, 1, 256] + - [927, 9729.57] + - - [256, 15104, 1, 1024] + - [916, 7289.18] + - - [8192, 8976, 1, 256] + - [919, 9395.59] + - - [256, 16128, 1, 1024] + - [919, 7461.38] + - - [13312, 8976, 1, 256] + - [927, 9551.07] + - - [256, 21504, 1, 1024] + - [924, 7636.03] + - - [6400, 8976, 1, 256] + - [911, 9561.06] + - - [256, 8960, 1, 1024] + - [886, 6292.46] + - - [1792, 8976, 1, 256] + - [901, 9372.28] + - - [13824, 8976, 1, 256] + - [919, 9585.37] + - - [11776, 8976, 1, 256] + - [919, 9560.44] + - - [256, 20992, 1, 1024] + - [917, 7490.75] + - - [20480, 8976, 1, 256] + - [927, 9610.8] + - - [5888, 8976, 1, 256] + - [898, 9565.3] + - - [256, 10496, 1, 1024] + - [889, 6632.06] + - - [21248, 8976, 1, 256] + - [919, 9755.87] + - - [5120, 8976, 1, 256] + - [927, 9244.69] + - - [7168, 8976, 1, 256] + - [919, 9388.52] + - - [2048, 1536, 1, 768] + - [908, 9446.14] + - - [256, 8192, 1, 1024] + - [913, 6948.99] + - - [4096, 8976, 1, 256] + - [918, 9116.04] + - - [3328, 8976, 1, 256] + - [911, 9434.65] + - - [1280, 8976, 1, 256] + - [909, 9129.9] + - - [2560, 8976, 1, 256] + - [906, 9199.58] + - - [3072, 8976, 1, 256] + - [921, 8963.7] + - - [256, 11776, 1, 1024] + - [899, 6869.9] + - - [18688, 8976, 1, 256] + - [927, 9726.31] + - - [15104, 8976, 1, 256] + - [927, 9715.81] + - - [23552, 8976, 1, 256] + - [919, 9648.52] + - - [6144, 8976, 1, 256] + - [927, 9339.9] + - - [12544, 8976, 1, 256] + - [927, 9654.55] + - - [256, 11264, 1, 1024] + - [900, 6815.08] + - - [2048, 114, 1, 512] + - [931, 4583.6] + - - [4352, 8976, 1, 256] + - [911, 9471.5] + - - [15360, 8976, 1, 256] + - [927, 9583.87] + - - [256, 31488, 1, 1024] + - [926, 8438.11] + - - [28672, 8976, 1, 256] + - [919, 9688.95] + - - [256, 18176, 1, 1024] + - [895, 7405.19] + - - [9728, 8976, 1, 256] + - [927, 9524.25] + - - [256, 2816, 1, 1024] + - [891, 5405.76] + - - [256, 18944, 1, 1024] + - [895, 7503.51] + - - [256, 3584, 1, 1024] + - [894, 6107.25] + - - [7936, 8976, 1, 256] + - [907, 9608.41] + - - [19712, 8976, 1, 256] + - [927, 9736.35] + - - [256, 14848, 1, 1024] + - [900, 7163.52] + - - [256, 8448, 1, 1024] + - [900, 6372.66] + - - [256, 6400, 1, 1024] + - [914, 6395.81] + - - [256, 6144, 1, 1024] + - [925, 6490.32] + - - [9472, 8976, 1, 256] + - [904, 9610.02] + - - [256, 9984, 1, 1024] + - [887, 6484.85] + - - [684, 8976, 1, 256] + - [896, 8128.63] + - - [20992, 8976, 1, 256] + - [919, 9689.75] + - - [2048, 684, 1, 512] + - [903, 7241.88] + - - [2048, 114, 1, 768] + - [929, 4872.56] + - - [8960, 8976, 1, 256] + - [902, 9603.45] + - - [2048, 1536, 1, 512] + - [905, 8830.21] + - - [256, 3328, 1, 1024] + - [893, 5612.65] + - - [33536, 8976, 1, 256] + - [919, 9797.81] + - - [2048, 8976, 1, 256] + - [919, 8975.56] + - - [10496, 8976, 1, 256] + - [910, 9654.53] + - - [256, 5376, 1, 1024] + - [928, 5626.44] + - - [256, 21248, 1, 1024] + - [897, 7525.55] + - - [256, 13312, 1, 1024] + - [895, 6767.21] + - - [16128, 8976, 1, 256] + - [919, 9715.67] + - - [2304, 8976, 1, 256] + - [892, 9433.93] + - - [256, 4864, 1, 1024] + - [882, 5743.65] + - - [17152, 8976, 1, 256] + - [927, 9709.04] + - - [15872, 8976, 1, 256] + - [927, 9657.67] + - - [9984, 8976, 1, 256] + - [904, 9639.84] + - - [256, 14592, 1, 1024] + - [916, 7224.02] + - - [256, 33536, 1, 1024] + - [923, 8147.41] + - - [11264, 8976, 1, 256] + - [919, 9510.06] + - - [31488, 8976, 1, 256] + - [927, 9799.41] + - - [256, 20480, 1, 1024] + - [900, 7498.3] + - - [44505, 8976, 1, 256] + - [911, 9804.88] + - - [13568, 8976, 1, 256] + - [919, 9680.34] + - - [256, 11520, 1, 1024] + - [899, 6805.36] + - - [256, 7936, 1, 1024] + - [915, 6971.87] + - - [2048, 256, 1, 768] + - [885, 7129.23] + - - [256, 4608, 1, 1024] + - [883, 5463.01] + - - [256, 2304, 1, 1024] + - [890, 4842.79] + - - [256, 2560, 1, 1024] + - [891, 5309.35] + - - [2816, 8976, 1, 256] + - [902, 9409.66] + - - [1728, 320, 1, 64] + - [938, 3205.67] + - - [1152, 128, 1, 784] + - [985, 3499.06] + - - [576, 96, 1, 5329] + - [971, 3948.02] + - - [864, 96, 1, 1225] + - [992, 3009.77] + - - [256, 128, 1, 784] + - [982, 1536.59] + - - [1440, 320, 1, 196] + - [935, 4824.72] + - - [192, 48, 1, 1225] + - [1013, 820.565] + - - [2592, 384, 1, 289] + - [953, 7353.11] + - - [192, 80, 36, 10368] + - [1003, 5360.14] + - - [896, 192, 1, 289] + - [970, 3076.66] + - - [768, 128, 1, 289] + - [995, 2351.91] + - - [64, 256, 1, 3136] + - [1021, 1809.26] + - - [1280, 384, 1, 64] + - [935, 3171.2] + - - [512, 144, 1, 196] + - [993, 1445.17] + - - [1344, 192, 1, 289] + - [976, 4376.62] + - - [288, 64, 1, 21609] + - [987, 3396.22] + - - [400, 32, 1, 784] + - [1014, 922.453] + - - [288, 32, 1, 21609] + - [1025, 2816.11] + - - [1280, 448, 1, 64] + - [938, 3253.66] + - - [3456, 256, 1, 169] + - [950, 5822.54] + - - [2304, 256, 1, 196] + - [948, 4932.08] + - - [384, 192, 1, 1225] + - [996, 2720.49] + - - [832, 48, 1, 49] + - [991, 344.618] + - - [832, 192, 1, 49] + - [973, 1099.46] + - - [1280, 192, 1, 64] + - [974, 2069.66] + - - [192, 32, 1, 784] + - [1013, 459.727] + - - [288, 48, 1, 1225] + - [1020, 1176.1] + - - [512, 112, 1, 196] + - [988, 1277.31] + - - [224, 192, 36, 2592] + - [1005, 7369.66] + - - [528, 32, 1, 196] + - [979, 440.474] + - - [192, 128, 36, 1568] + - [1004, 8245.86] + - - [4032, 384, 1, 64] + - [949, 5898.34] + - - [576, 64, 1, 3136] + - [994, 2671.21] + - - [2048, 32, 1, 1001] + - [996, 2323.1] + - - [480, 64, 1, 196] + - [981, 752.74] + - - [512, 256, 1, 196] + - [983, 2528.65] + - - [864, 96, 1, 289] + - [993, 1958.5] + - - [896, 128, 1, 289] + - [996, 2725.83] + - - [192, 64, 1, 784] + - [1011, 898.775] + - - [1200, 64, 1, 1225] + - [995, 2780.24] + - - [1296, 288, 1, 196] + - [934, 3826.28] + - - [576, 96, 1, 5041] + - [975, 3795.68] + - - [1024, 256, 1, 289] + - [964, 4488.23] + - - [1024, 2048, 1, 49] + - [954, 5077.2] + - - [192, 64, 36, 6272] + - [998, 7515.08] + - - [4096, 512, 1, 4096] + - [960, 10276.1] + - - [192, 32, 1, 1225] + - [1014, 556.786] + - - [1024, 256, 1, 196] + - [974, 3892.54] + - - [1120, 192, 1, 289] + - [963, 3752.91] + - - [400, 48, 1, 196] + - [988, 480.1] + - - [1728, 224, 1, 1225] + - [941, 5575.87] + - - [800, 96, 1, 784] + - [995, 2669.04] + - - [1152, 384, 1, 64] + - [945, 3077.44] + - - [4608, 512, 1, 49] + - [952, 4676.7] + - - [1792, 256, 1, 289] + - [945, 5346.04] + - - [864, 128, 1, 784] + - [995, 3816.3] + - - [1728, 384, 1, 169] + - [947, 5191.78] + - - [480, 16, 1, 196] + - [1016, 241.331] + - - [1568, 256, 1, 289] + - [935, 4723.51] + - - [1152, 448, 1, 64] + - [941, 3356.82] + - - [512, 64, 1, 196] + - [980, 802.916] + - - [1344, 224, 1, 289] + - [935, 3519.73] + - - [9216, 512, 1, 4096] + - [958, 9146.12] + - - [27, 32, 1, 22201] + - [1026, 264.456] + - - [1152, 192, 1, 784] + - [965, 4904.18] + - - [1536, 256, 1, 64] + - [933, 2578.57] + - - [800, 128, 1, 196] + - [995, 1991.21] + - - [800, 64, 1, 196] + - [990, 1150.93] + - - [864, 208, 1, 196] + - [967, 2684.82] + - - [1440, 320, 1, 49] + - [936, 2313.54] + - - [512, 128, 1, 784] + - [986, 2780.42] + - - [720, 192, 1, 5041] + - [961, 5410.56] + - - [256, 64, 1, 784] + - [1018, 1163.6] + - - [256, 48, 1, 1225] + - [1013, 1075.3] + - - [576, 192, 1, 3136] + - [961, 4833.11] + - - [160, 64, 1, 5329] + - [1015, 1753.6] + - - [3456, 384, 1, 289] + - [955, 7341.85] + - - [32, 32, 36, 43808] + - [1009, 1378.13] + - - [1344, 512, 1, 64] + - [934, 3823.03] + - - [192, 16, 1, 784] + - [1014, 228.173] + - - [3456, 384, 1, 169] + - [951, 6675.12] + - - [1152, 256, 1, 196] + - [944, 3211.36] + - - [1728, 192, 1, 1225] + - [945, 4852.36] + - - [2048, 512, 1, 49] + - [957, 3471.74] + - - [576, 96, 1, 1225] + - [988, 2176.76] + - - [512, 2048, 1, 49] + - [939, 3845.93] + - - [1728, 192, 1, 64] + - [934, 2369.93] + - - [832, 256, 1, 49] + - [964, 1433.7] + - - [512, 128, 1, 196] + - [989, 1459.77] + - - [1200, 128, 1, 49] + - [984, 1069.19] + - - [528, 256, 1, 196] + - [972, 2069.86] + - - [256, 512, 1, 784] + - [995, 4538.99] + - - [480, 192, 1, 196] + - [995, 1792.1] + - - [96, 64, 36, 2592] + - [1002, 4845.51] + - - [96, 96, 36, 2592] + - [1007, 5111.63] + - - [1024, 192, 1, 289] + - [969, 3431.24] + - - [1536, 384, 1, 64] + - [940, 3166.94] + - - [192, 96, 1, 784] + - [980, 881.24] + - - [2048, 192, 1, 64] + - [937, 2330.27] + - - [192, 64, 1, 1225] + - [1019, 1100.45] + - - [512, 32, 1, 196] + - [1010, 477.967] + - - [128, 96, 36, 1568] + - [1006, 6649.19] + - - [528, 128, 1, 196] + - [992, 1403.33] + - - [128, 512, 1, 784] + - [982, 2237.91] + - - [128, 128, 36, 3136] + - [999, 6538.87] + - - [528, 160, 1, 196] + - [996, 1642.77] + - - [448, 64, 1, 5329] + - [971, 3264.91] + - - [1280, 320, 1, 64] + - [935, 2777.05] + - - [1792, 320, 1, 289] + - [947, 5205.0] + - - [2880, 320, 1, 64] + - [943, 4337.04] + - - [147, 64, 1, 12544] + - [1024, 2430.37] + - - [4096, 512, 1, 1001] + - [959, 9619.09] + - - [1536, 32, 1, 1001] + - [996, 1757.28] + - - [512, 160, 1, 196] + - [992, 1592.99] + - - [768, 160, 1, 289] + - [993, 2757.27] + - - [1728, 384, 1, 49] + - [945, 3102.59] + - - [64, 32, 36, 43808] + - [1000, 2626.53] + - - [64, 64, 1, 3136] + - [1012, 610.606] + - - [256, 32, 1, 784] + - [1013, 612.937] + - - [480, 96, 1, 196] + - [988, 1055.2] + - - [1024, 32, 1, 1001] + - [978, 1188.53] + - - [832, 160, 1, 49] + - [993, 959.347] + - - [512, 1024, 1, 196] + - [936, 4978.8] + - - [96, 64, 36, 10368] + - [1030, 5001.05] + - - [384, 448, 36, 512] + - [1035, 8903.1] + - - [2048, 64, 1, 1001] + - [1028, 4385.23] + - - [224, 192, 36, 5184] + - [1034, 7487.91] + - - [2048, 128, 1, 1001] + - [1027, 5764.73] + - - [96, 96, 36, 10368] + - [1036, 5275.31] + - - [192, 80, 36, 20736] + - [1032, 5409.5] + - - [96, 64, 36, 5184] + - [1030, 4911.93] + - - [1536, 64, 1, 1001] + - [1029, 3162.13] + - - [96, 64, 36, 20736] + - [1031, 5034.43] + - - [384, 448, 36, 256] + - [1033, 8815.97] + - - [96, 96, 36, 5184] + - [1037, 5236.12] - null